diff --git a/mysql-test/suite/innodb/r/cats-autoinc.result b/mysql-test/suite/innodb/r/cats-autoinc.result index d1f79d312c44..16332890746c 100644 --- a/mysql-test/suite/innodb/r/cats-autoinc.result +++ b/mysql-test/suite/innodb/r/cats-autoinc.result @@ -51,3 +51,41 @@ SET @@global.innodb_lock_wait_timeout = @innodb_lock_wait_timeout_saved; # End of Bug #27944920 # # # ######################## +CREATE TABLE t1 ( +id INT PRIMARY KEY AUTO_INCREMENT, +val INT +) Engine=InnoDB; +CREATE TABLE t2 ( +id INT PRIMARY KEY +) Engine=InnoDB; +CREATE TABLE t3 ( +id INT PRIMARY KEY, +val INT +) Engine=InnoDB; +INSERT INTO t1 (id, val) VALUES (1,1); +INSERT INTO t2 (id) VALUES (1),(2),(3); +INSERT INTO t3 (id, val) VALUES (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +SET @innodb_lock_wait_timeout_saved = @@global.innodb_lock_wait_timeout; +SET @@global.innodb_lock_wait_timeout = 100000; +BEGIN; +SELECT * FROM t2 WHERE id=2 FOR UPDATE; +id +2 +BEGIN; +SELECT * FROM t2 WHERE id=3 FOR UPDATE; +id +3 +BEGIN; +UPDATE t3 SET val = 13; +SET DEBUG_SYNC = 'lock_wait_will_wait SIGNAL C2_will_wait'; +INSERT INTO t1 (val) SELECT id FROM t2; +SET DEBUG_SYNC = 'now WAIT_FOR C2_will_wait'; +SET DEBUG_SYNC = 'lock_wait_will_wait SIGNAL C3_will_wait'; +INSERT INTO t1 (val) VALUES (13);; +SET DEBUG_SYNC = 'now WAIT_FOR C3_will_wait'; +ROLLBACK; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +ROLLBACK; +ROLLBACK; +DROP TABLES t1,t2,t3; +SET @@global.innodb_lock_wait_timeout = @innodb_lock_wait_timeout_saved; diff --git a/mysql-test/suite/innodb/r/lock_rec_unlock.result b/mysql-test/suite/innodb/r/lock_rec_unlock.result index 886a54782feb..6d8ec5da9ea2 100644 --- a/mysql-test/suite/innodb/r/lock_rec_unlock.result +++ b/mysql-test/suite/innodb/r/lock_rec_unlock.result @@ -59,3 +59,25 @@ SET DEBUG_SYNC = 'RESET'; # End of Bug #27898384 # # # ######################## +# Bug #31046834 ASSERTION FAILURE: TRX0TRX.CC:2663:TRX_ALLOWED_TWO_LATCHES THREAD 14024410520550 +# Bug #31047326 ASSERTION FAILURE: TRX0TRX.CC:2663:TRX_ALLOWED_2_LATCHES THREAD 139840853837568 +CREATE TABLE t1 ( +id INT PRIMARY KEY, +val INT +) Engine=InnoDB; +INSERT INTO t1 (id, val) VALUES (1,1); +SET TRANSACTION ISOLATION LEVEL READ COMMITTED; +BEGIN; +SET DEBUG_SYNC = 'after_lock_clust_rec_read_check_and_lock SIGNAL con1_created_lock WAIT_FOR con2_will_wait'; +SELECT * FROM t1 WHERE val=13 FOR UPDATE; +SET DEBUG_SYNC = 'now WAIT_FOR con1_created_lock'; +BEGIN; +SET DEBUG_SYNC = 'lock_wait_will_wait SIGNAL con2_will_wait'; +SELECT * FROM t1 WHERE id=1 FOR UPDATE; +id val +COMMIT; +id val +1 1 +COMMIT; +DROP TABLE t1; +SET DEBUG_SYNC = 'RESET'; diff --git a/mysql-test/suite/innodb/r/lock_sys_resize.result b/mysql-test/suite/innodb/r/lock_sys_resize.result new file mode 100644 index 000000000000..33fe134225e6 --- /dev/null +++ b/mysql-test/suite/innodb/r/lock_sys_resize.result @@ -0,0 +1,31 @@ +# Bug #31329634 ASSERTION FAILURE: +# LOCK0LATCHES.CC:42:LOCK_SYS->REC_HASH->N_CELLS == LOCK_SYS->P +SELECT @@innodb_buffer_pool_size; +@@innodb_buffer_pool_size +17825792 +SELECT @@innodb_buffer_pool_chunk_size; +@@innodb_buffer_pool_chunk_size +1048576 +CREATE TABLE t1 (id INT PRIMARY KEY, val VARCHAR(1000)) ENGINE=INNODB; +INSERT INTO t1 (id,val) VALUES (1,''),(2,''),(3,''),(4,''),(5,''); +SET DEBUG_SYNC='lock_rec_restore_from_page_infimum_will_latch + SIGNAL con1_will_latch + WAIT_FOR con1_can_go'; +UPDATE t1 SET val='aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +SET DEBUG_SYNC='now WAIT_FOR con1_will_latch'; +SET GLOBAL 
DEBUG='+d,syncpoint_after_lock_sys_resize_rec_hash'; +SET GLOBAL innodb_buffer_pool_size= +@@innodb_buffer_pool_size * 2 + @@innodb_buffer_pool_chunk_size; +SET DEBUG_SYNC='now WAIT_FOR reached_after_lock_sys_resize_rec_hash'; +SET DEBUG_SYNC='now SIGNAL con1_can_go'; +SET GLOBAL DEBUG='-d,syncpoint_after_lock_sys_resize_rec_hash'; +SET DEBUG_SYNC='now SIGNAL continue_after_lock_sys_resize_rec_hash'; +DROP TABLE t1; +SELECT @@innodb_buffer_pool_size; +@@innodb_buffer_pool_size +36700160 +SET GLOBAL innodb_buffer_pool_size= +(@@innodb_buffer_pool_size - @@innodb_buffer_pool_chunk_size) div 2; +SELECT @@innodb_buffer_pool_size; +@@innodb_buffer_pool_size +17825792 diff --git a/mysql-test/suite/innodb/r/lock_trx_release_read_locks_in_x_mode.result b/mysql-test/suite/innodb/r/lock_trx_release_read_locks_in_x_mode.result new file mode 100644 index 000000000000..676aaa3c80dc --- /dev/null +++ b/mysql-test/suite/innodb/r/lock_trx_release_read_locks_in_x_mode.result @@ -0,0 +1,68 @@ +CREATE TABLE t0 (id INT PRIMARY KEY) ENGINE=InnoDB; +CREATE TABLE t1 (id INT PRIMARY KEY) ENGINE=InnoDB; +CREATE TABLE t2 (id INT PRIMARY KEY) ENGINE=InnoDB; +CREATE TABLE t3 (id INT PRIMARY KEY) ENGINE=InnoDB; +CREATE TABLE t4 (id INT PRIMARY KEY) ENGINE=InnoDB; +CREATE TABLE t5 (id INT PRIMARY KEY) ENGINE=InnoDB; +INSERT INTO t0 (id) VALUES (1); +SET TRANSACTION ISOLATION LEVEL READ COMMITTED; +XA START 'x'; +INSERT INTO t1 (id) VALUES (1);; +INSERT INTO t2 (id) VALUES (1);; +INSERT INTO t3 (id) VALUES (1);; +INSERT INTO t4 (id) VALUES (1);; +INSERT INTO t5 (id) VALUES (1);; +SELECT * FROM t0 WHERE id=1 FOR UPDATE; +id +1 +XA END 'x'; +SET DEBUG_SYNC='lock_trx_release_read_locks_in_x_mode_will_release + SIGNAL c0_releases_in_xmode'; +SET DEBUG_SYNC='try_relatch_trx_and_shard_and_do_noted_expected_version + SIGNAL c0_noted_expected_version + WAIT_FOR c0_can_go + EXECUTE 5'; +XA PREPARE 'x'; +BEGIN; +SET DEBUG_SYNC = 'now WAIT_FOR c0_noted_expected_version'; +SET DEBUG_SYNC='lock_wait_will_wait SIGNAL c0_can_go'; +SELECT * FROM t1 FOR SHARE; +BEGIN; +SET DEBUG_SYNC = 'now WAIT_FOR c0_noted_expected_version'; +SET DEBUG_SYNC='lock_wait_will_wait SIGNAL c0_can_go'; +SELECT * FROM t2 FOR SHARE; +BEGIN; +SET DEBUG_SYNC = 'now WAIT_FOR c0_noted_expected_version'; +SET DEBUG_SYNC='lock_wait_will_wait SIGNAL c0_can_go'; +SELECT * FROM t3 FOR SHARE; +BEGIN; +SET DEBUG_SYNC = 'now WAIT_FOR c0_noted_expected_version'; +SET DEBUG_SYNC='lock_wait_will_wait SIGNAL c0_can_go'; +SELECT * FROM t4 FOR SHARE; +BEGIN; +SET DEBUG_SYNC = 'now WAIT_FOR c0_noted_expected_version'; +SET DEBUG_SYNC='lock_wait_will_wait SIGNAL c0_can_go'; +SELECT * FROM t5 FOR SHARE; +SET DEBUG_SYNC='now WAIT_FOR c0_releases_in_xmode'; +XA COMMIT 'x'; +id +1 +COMMIT; +id +1 +COMMIT; +id +1 +COMMIT; +id +1 +COMMIT; +id +1 +COMMIT; +DROP TABLE t0; +DROP TABLE t1; +DROP TABLE t2; +DROP TABLE t3; +DROP TABLE t4; +DROP TABLE t5; diff --git a/mysql-test/suite/innodb/r/rec_offsets.result b/mysql-test/suite/innodb/r/rec_offsets.result new file mode 100644 index 000000000000..54ce07efc121 --- /dev/null +++ b/mysql-test/suite/innodb/r/rec_offsets.result @@ -0,0 +1,20 @@ +CREATE TABLE t ( +id INT PRIMARY KEY, +c0 INT, c1 INT, c2 INT, c3 INT, c4 INT, c5 INT, c6 INT, c7 INT, c8 INT, c9 INT, +c10 INT, c11 INT, c12 INT, c13 INT, c14 INT, c15 INT, c16 INT, c17 INT, c18 INT, c19 INT, +c20 INT, c21 INT, c22 INT, c23 INT, c24 INT, c25 INT, c26 INT, c27 INT, c28 INT, c29 INT, +c30 INT, c31 INT, c32 INT, c33 INT, c34 INT, c35 INT, c36 INT, c37 INT, c38 INT, c39 INT, +c40 INT, c41 
INT, c42 INT, c43 INT, c44 INT, c45 INT, c46 INT, c47 INT, c48 INT, c49 INT, +c50 INT, c51 INT, c52 INT, c53 INT, c54 INT, c55 INT, c56 INT, c57 INT, c58 INT, c59 INT, +c60 INT, c61 INT, c62 INT, c63 INT, c64 INT, c65 INT, c66 INT, c67 INT, c68 INT, c69 INT, +c70 INT, c71 INT, c72 INT, c73 INT, c74 INT, c75 INT, c76 INT, c77 INT, c78 INT, c79 INT, +c80 INT, c81 INT, c82 INT, c83 INT, c84 INT, c85 INT, c86 INT, c87 INT, c88 INT, c89 INT, +c90 INT, c91 INT, c92 INT, c93 INT, c94 INT, c95 INT, c96 INT, c97 INT, c98 INT, c99 INT, +c100 INT UNIQUE KEY +) ENGINE=InnoDB; +BEGIN; +INSERT INTO t (id,c100) VALUES (1,1); +INSERT INTO t (id,c100) VALUES (2,1); +ERROR 23000: Duplicate entry '1' for key 't.c100' +COMMIT; +DROP TABLE t; diff --git a/mysql-test/suite/innodb/t/cats-autoinc.test b/mysql-test/suite/innodb/t/cats-autoinc.test index dd2a2182843f..1df63fce0b60 100644 --- a/mysql-test/suite/innodb/t/cats-autoinc.test +++ b/mysql-test/suite/innodb/t/cats-autoinc.test @@ -136,3 +136,97 @@ --echo # End of Bug #27944920 # --echo # # --echo ######################## + +# Following scenario is intended to cover the rare case of trx being +# killed while waiting for a table lock, which excersises the table +# lock case in lock_cancel_waiting_and_release function. +# +# To generate a situation when trx is waiting for a table lock inside +# InnoDB we use following scenario: +# C1 locks t2.id = 2 +# C3 locks t2.id = 3 +# C2 obtains t1.AUTO_INC and waits for C1 t2.id=2 row lock +# C3 tries to insert to t1, and has to wait for C2's autoinc lock +# C1 rolls back, which unlocks t2.id=2, and C2 proceeds to lock t2.id=3, +# and now is blocked by C3, but C3 is already blocked by C2, so we have +# a deadlock cycle. +# We make C2 heavy to make sure that C3 is chosen as victim, by modyfing +# many rows in t3. + + CREATE TABLE t1 ( + id INT PRIMARY KEY AUTO_INCREMENT, + val INT + ) Engine=InnoDB; + + CREATE TABLE t2 ( + id INT PRIMARY KEY + ) Engine=InnoDB; + + CREATE TABLE t3 ( + id INT PRIMARY KEY, + val INT + ) Engine=InnoDB; + + INSERT INTO t1 (id, val) VALUES (1,1); + INSERT INTO t2 (id) VALUES (1),(2),(3); + INSERT INTO t3 (id, val) VALUES (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); + + # Save the original settings, to be restored at the end of test + SET @innodb_lock_wait_timeout_saved = @@global.innodb_lock_wait_timeout; + + # Make sure that transactions will not finish prematurely + SET @@global.innodb_lock_wait_timeout = 100000; + + + --connect (C1, localhost, root,,) + --connect (C2, localhost, root,,) + --connect (C3, localhost, root,,) + + --connection C1 + BEGIN; + SELECT * FROM t2 WHERE id=2 FOR UPDATE; + + --connection C3 + BEGIN; + SELECT * FROM t2 WHERE id=3 FOR UPDATE; + + --connection C2 + BEGIN; + UPDATE t3 SET val = 13; + SET DEBUG_SYNC = 'lock_wait_will_wait SIGNAL C2_will_wait'; + --send INSERT INTO t1 (val) SELECT id FROM t2 + # C2 --waits-for[t2.id=2]--> C1 + + --connection C3 + SET DEBUG_SYNC = 'now WAIT_FOR C2_will_wait'; + SET DEBUG_SYNC = 'lock_wait_will_wait SIGNAL C3_will_wait'; + --send INSERT INTO t1 (val) VALUES (13); + # C3 --waits-for[t1.autoinc]--> C2 --waits-for[t2.id=2]--> C1 + + --connection C1 + SET DEBUG_SYNC = 'now WAIT_FOR C3_will_wait'; + ROLLBACK; + # C3 --waits-for[t1.autoinc]--> C2 --waits-for[t2.id=3]--> C3 + # this is a deadlock. 
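
Why C3 rather than C2 ends up as the deadlock victim can be modeled with a tiny weight comparison. This is an illustrative, self-contained sketch only; the struct, cost function, and helper names are not InnoDB's, and the real selection is done by the deadlock checker in lock0lock.cc:

#include <cstdint>

// Toy illustration of the victim choice relied on above: the deadlock resolver
// prefers to roll back the transaction whose rollback discards the least work,
// which is why C2 is first made "heavy" by updating every row of t3.
struct TrxCost {
  std::uint64_t rows_modified;  // undo records a rollback would throw away
  std::uint64_t locks_held;     // locks that would have to be re-acquired
};

std::uint64_t weight(const TrxCost &t) { return t.rows_modified + t.locks_held; }

// True when `candidate` is the cheaper transaction to sacrifice.
bool choose_as_victim(const TrxCost &candidate, const TrxCost &other) {
  return weight(candidate) <= weight(other);
}
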
+ + --connection C3 + --error ER_LOCK_DEADLOCK + --reap + ROLLBACK; + + --connection C2 + --reap + ROLLBACK; + + --connection default + --disconnect C1 + --disconnect C2 + --disconnect C3 + + + DROP TABLES t1,t2,t3; + + # Restore saved state + + SET @@global.innodb_lock_wait_timeout = @innodb_lock_wait_timeout_saved; + diff --git a/mysql-test/suite/innodb/t/lock_rec_unlock.test b/mysql-test/suite/innodb/t/lock_rec_unlock.test index cb2000c02c2e..ae641cf66ae1 100644 --- a/mysql-test/suite/innodb/t/lock_rec_unlock.test +++ b/mysql-test/suite/innodb/t/lock_rec_unlock.test @@ -1,4 +1,5 @@ --source include/have_debug_sync.inc +--source include/count_sessions.inc --echo ################################################################# --echo # # @@ -139,3 +140,47 @@ --echo # End of Bug #27898384 # --echo # # --echo ######################## + + +--echo # Bug #31046834 ASSERTION FAILURE: TRX0TRX.CC:2663:TRX_ALLOWED_TWO_LATCHES THREAD 14024410520550 +--echo # Bug #31047326 ASSERTION FAILURE: TRX0TRX.CC:2663:TRX_ALLOWED_2_LATCHES THREAD 139840853837568 + + CREATE TABLE t1 ( + id INT PRIMARY KEY, + val INT + ) Engine=InnoDB; + INSERT INTO t1 (id, val) VALUES (1,1); + + --connect (con1, localhost, root,,) + --connect (con2, localhost, root,,) + + --connection con1 + SET TRANSACTION ISOLATION LEVEL READ COMMITTED; + BEGIN; + SET DEBUG_SYNC = 'after_lock_clust_rec_read_check_and_lock SIGNAL con1_created_lock WAIT_FOR con2_will_wait'; + --send SELECT * FROM t1 WHERE val=13 FOR UPDATE + + --connection con2 + SET DEBUG_SYNC = 'now WAIT_FOR con1_created_lock'; + BEGIN; + SET DEBUG_SYNC = 'lock_wait_will_wait SIGNAL con2_will_wait'; + --send SELECT * FROM t1 WHERE id=1 FOR UPDATE + + --connection con1 + --reap + COMMIT; + + --connection con2 + --reap + COMMIT; + + + + # Clean up: + --connection default + --disconnect con1 + --disconnect con2 + DROP TABLE t1; + SET DEBUG_SYNC = 'RESET'; + +--source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/innodb/t/lock_sys_resize-master.opt b/mysql-test/suite/innodb/t/lock_sys_resize-master.opt new file mode 100644 index 000000000000..4f833433c9b9 --- /dev/null +++ b/mysql-test/suite/innodb/t/lock_sys_resize-master.opt @@ -0,0 +1,2 @@ +--innodb_buffer-pool-size=17825792 +--innodb-buffer-pool-chunk-size=1048576 diff --git a/mysql-test/suite/innodb/t/lock_sys_resize.test b/mysql-test/suite/innodb/t/lock_sys_resize.test new file mode 100644 index 000000000000..a7dfd3e11a9f --- /dev/null +++ b/mysql-test/suite/innodb/t/lock_sys_resize.test @@ -0,0 +1,76 @@ +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/count_sessions.inc + +--echo # Bug #31329634 ASSERTION FAILURE: +--echo # LOCK0LATCHES.CC:42:LOCK_SYS->REC_HASH->N_CELLS == LOCK_SYS->P + +# The two values in master-opt were chosen so that the following +# SET GLOBAL innodb_buffer_pool_size= ... +# will succeed, and will resize lock_sys in parallel to the UPDATE. +# (As opposed to say, block, as is the case when it shrinks instead of growing) +# Also it must be larger than BUF_LRU_MIN_LEN pages, as otherwise BP shrink will +# not be able to finish as it will try to keep BUF_LRU_MIN_LEN pages in BP. 
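
For reference, the size arithmetic the comment above relies on, expressed as a compile-time check. The numeric values are taken directly from lock_sys_resize-master.opt and the .result file; the sketch itself is not part of the test files:

// Start at 17 chunks, grow to 2*17 + 1 = 35 chunks, then shrink back to 17.
constexpr long long chunk = 1048576;            // innodb_buffer_pool_chunk_size
constexpr long long start = 17825792;           // 17 * chunk, from master.opt
constexpr long long grown = start * 2 + chunk;  // the value SET by the test
static_assert(grown == 36700160, "matches SELECT @@innodb_buffer_pool_size in the .result");
static_assert((grown - chunk) / 2 == start, "the final SET restores the original size");
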
+SELECT @@innodb_buffer_pool_size; +SELECT @@innodb_buffer_pool_chunk_size; + +CREATE TABLE t1 (id INT PRIMARY KEY, val VARCHAR(1000)) ENGINE=INNODB; +INSERT INTO t1 (id,val) VALUES (1,''),(2,''),(3,''),(4,''),(5,''); + + +--connect (con1,localhost,root,,) + SET DEBUG_SYNC='lock_rec_restore_from_page_infimum_will_latch + SIGNAL con1_will_latch + WAIT_FOR con1_can_go'; + # This will cause resize of records and require calls to + # lock_rec_restore_from_page_infimum() which exercise Shard_latches_guard + --send UPDATE t1 SET val='aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + +--connect (con2,localhost,root,,) + SET DEBUG_SYNC='now WAIT_FOR con1_will_latch'; + # resize happens in a background thread so we need to enable sync point + SET GLOBAL DEBUG='+d,syncpoint_after_lock_sys_resize_rec_hash'; + SET GLOBAL innodb_buffer_pool_size= + @@innodb_buffer_pool_size * 2 + @@innodb_buffer_pool_chunk_size; + +--connection default + SET DEBUG_SYNC='now WAIT_FOR reached_after_lock_sys_resize_rec_hash'; + SET DEBUG_SYNC='now SIGNAL con1_can_go'; + # This is the moment where con1 could observe assertion failure + SET GLOBAL DEBUG='-d,syncpoint_after_lock_sys_resize_rec_hash'; + SET DEBUG_SYNC='now SIGNAL continue_after_lock_sys_resize_rec_hash'; + +--connection con1 + --reap + + +--connection default +--disconnect con1 +--disconnect con2 + +DROP TABLE t1; + +# Make sure we finish previous resizing before issuing another +let $wait_timeout = 60; +let $wait_condition = + SELECT SUBSTR(variable_value, 1, 9) = 'Completed' + FROM performance_schema.global_status + WHERE variable_name = 'innodb_buffer_pool_resize_status'; +--source include/wait_condition.inc + +SELECT @@innodb_buffer_pool_size; + +# Restore original smaller size +SET GLOBAL innodb_buffer_pool_size= + (@@innodb_buffer_pool_size - @@innodb_buffer_pool_chunk_size) div 2; +# Make sure we finish resizing and restore original state before ending +let $wait_timeout = 60; +let $wait_condition = + SELECT SUBSTR(variable_value, 1, 9) = 'Completed' + FROM performance_schema.global_status + WHERE variable_name = 'innodb_buffer_pool_resize_status'; +--source include/wait_condition.inc + +SELECT @@innodb_buffer_pool_size; + +--source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/innodb/t/lock_trx_release_read_locks_in_x_mode.test b/mysql-test/suite/innodb/t/lock_trx_release_read_locks_in_x_mode.test new file mode 100644 index 000000000000..24dfc31882dd --- /dev/null +++ b/mysql-test/suite/innodb/t/lock_trx_release_read_locks_in_x_mode.test @@ -0,0 +1,84 @@ +--source include/have_debug_sync.inc +--source include/count_sessions.inc + +# This test scenario exercises a rare case in which READ COMMITTED +# (or UNCOMMITTED) transaction tries to release read locks during PREPARE stage +# (as is typical for XA or in group replication) and is interupted MAX_FAILURES +# times by other transactions when trying to iterate over its own list of locks. +# The other transactions are converting implicit locks of the transaction to +# explicit, adding the explicit locks to the list the transaction is iterating +# over, so it has to restart. Finally the transaction gives up with s-latching +# and attempts to x-latch the whole lock-sys to get job done. 
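
A minimal C++ model of the retry-then-escalate mechanism this scenario exercises. It is illustrative only: the real logic lives in lock_trx_release_read_locks(), which uses the lock_sys global latch rather than a bare std::shared_mutex, and the helper names here are made up:

#include <shared_mutex>

// MAX_FAILURES mirrors the constant referenced above.
constexpr int MAX_FAILURES = 5;

std::shared_mutex global_latch;

// Returns false when another thread invalidated the scan (stubbed here).
bool release_read_locks_s_mode() { return false; }
// Under the exclusive latch nobody can touch the lock list, so this always finishes.
void release_read_locks_x_mode() {}

void release_read_locks_model() {
  for (int failures = 0; failures < MAX_FAILURES; ++failures) {
    std::shared_lock<std::shared_mutex> s{global_latch};
    if (release_read_locks_s_mode()) {
      return;  // iterated the whole lock list without interference
    }
    // Another transaction converted one of our implicit locks to an explicit
    // one, growing the list we were iterating over -- start over.
  }
  // Optimistic attempts exhausted: x-latch everything and finish the job
  // (the lock_trx_release_read_locks_in_x_mode_will_release sync point
  // corresponds to this branch).
  std::unique_lock<std::shared_mutex> x{global_latch};
  release_read_locks_x_mode();
}
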
+ +# keep in sync with MAX_FAILURES defined in lock_trx_release_read_locks() +--let MAX_FAILURES=5 +# We create one extra table +--let i=0 +while($i<=$MAX_FAILURES) +{ + --eval CREATE TABLE t$i (id INT PRIMARY KEY) ENGINE=InnoDB + --inc $i +} +# We will need this row to create explicit lock on it from c0 +INSERT INTO t0 (id) VALUES (1); + +--connect (c0, localhost, root,,) + SET TRANSACTION ISOLATION LEVEL READ COMMITTED; + XA START 'x'; + # create at least MAX_FAILURES implicit locks + --let i=1 + while($i<=$MAX_FAILURES) + { + --eval INSERT INTO t$i (id) VALUES (1); + --inc $i + } + # create at least 1 explicit lock + SELECT * FROM t0 WHERE id=1 FOR UPDATE; + XA END 'x'; + SET DEBUG_SYNC='lock_trx_release_read_locks_in_x_mode_will_release + SIGNAL c0_releases_in_xmode'; + SET DEBUG_SYNC='try_relatch_trx_and_shard_and_do_noted_expected_version + SIGNAL c0_noted_expected_version + WAIT_FOR c0_can_go + EXECUTE 5'; + --send XA PREPARE 'x' + +--let i=1 +while($i<=$MAX_FAILURES) +{ + --connect (c$i, localhost, root,,) + BEGIN; + SET DEBUG_SYNC = 'now WAIT_FOR c0_noted_expected_version'; + --eval SET DEBUG_SYNC='lock_wait_will_wait SIGNAL c0_can_go' + --send_eval SELECT * FROM t$i FOR SHARE + + --inc $i +} + +--connection default +SET DEBUG_SYNC='now WAIT_FOR c0_releases_in_xmode'; + +--connection c0 + --reap + XA COMMIT 'x'; + +--disconnect c0 +--let i=1 +while($i<=$MAX_FAILURES) +{ + --connection c$i + --reap + COMMIT; + --connection default + --disconnect c$i + --inc $i +} + +--let i=0 +while($i<=$MAX_FAILURES) +{ + --eval DROP TABLE t$i + --inc $i +} + +--source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/innodb/t/rec_offsets.test b/mysql-test/suite/innodb/t/rec_offsets.test new file mode 100644 index 000000000000..a02419902c54 --- /dev/null +++ b/mysql-test/suite/innodb/t/rec_offsets.test @@ -0,0 +1,31 @@ +# More than 100 columns for sure will overflow REC_OFFS_NORMAL_SIZE. + +CREATE TABLE t ( + id INT PRIMARY KEY, + c0 INT, c1 INT, c2 INT, c3 INT, c4 INT, c5 INT, c6 INT, c7 INT, c8 INT, c9 INT, + c10 INT, c11 INT, c12 INT, c13 INT, c14 INT, c15 INT, c16 INT, c17 INT, c18 INT, c19 INT, + c20 INT, c21 INT, c22 INT, c23 INT, c24 INT, c25 INT, c26 INT, c27 INT, c28 INT, c29 INT, + c30 INT, c31 INT, c32 INT, c33 INT, c34 INT, c35 INT, c36 INT, c37 INT, c38 INT, c39 INT, + c40 INT, c41 INT, c42 INT, c43 INT, c44 INT, c45 INT, c46 INT, c47 INT, c48 INT, c49 INT, + c50 INT, c51 INT, c52 INT, c53 INT, c54 INT, c55 INT, c56 INT, c57 INT, c58 INT, c59 INT, + c60 INT, c61 INT, c62 INT, c63 INT, c64 INT, c65 INT, c66 INT, c67 INT, c68 INT, c69 INT, + c70 INT, c71 INT, c72 INT, c73 INT, c74 INT, c75 INT, c76 INT, c77 INT, c78 INT, c79 INT, + c80 INT, c81 INT, c82 INT, c83 INT, c84 INT, c85 INT, c86 INT, c87 INT, c88 INT, c89 INT, + c90 INT, c91 INT, c92 INT, c93 INT, c94 INT, c95 INT, c96 INT, c97 INT, c98 INT, c99 INT, + c100 INT UNIQUE KEY +) ENGINE=InnoDB; + +# In this test we exercise the nontrivial case in lock_rec_convert_impl_to_expl_for_trx +# being called from row_convert_impl_to_expl_if_needed without precomputed offsets and +# requiring more than REC_OFFS_NORMAL_SIZE to be allocated. +# For that we need to cause secondary unique index conflict, so that the trx has to +# rollback, but only to save_point (that is: only rollback single query, not whole trx). 
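
A self-contained sketch of the stack-versus-heap decision described above. The constant and helper are illustrative, not the real rem0rec.h definitions; the actual decision is made inside rec_get_offsets():

#include <cstddef>
#include <vector>

// Offsets for a record normally fit in a caller-provided array of
// REC_OFFS_NORMAL_SIZE slots; only records with many columns spill to a heap
// allocation, which is the path this test forces.
constexpr std::size_t REC_OFFS_NORMAL_SIZE = 100;

std::size_t *get_offsets(std::size_t n_fields, std::size_t *stack_buf,
                         std::vector<std::size_t> &heap) {
  if (n_fields + 2 <= REC_OFFS_NORMAL_SIZE) {
    return stack_buf;  // common case: offsets fit in the fixed-size buffer
  }
  // Rare case exercised by this test: too many columns, so the offsets are
  // heap-allocated inside lock_rec_convert_impl_to_expl_for_trx(), the path
  // the reported assertions came from.
  heap.resize(n_fields + 2);
  return heap.data();
}
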
+ +BEGIN; +INSERT INTO t (id,c100) VALUES (1,1); +--error ER_DUP_ENTRY +INSERT INTO t (id,c100) VALUES (2,1); +COMMIT; + +DROP TABLE t; + diff --git a/mysql-test/suite/perfschema/r/sxlock_func.result b/mysql-test/suite/perfschema/r/sxlock_func.result index d38b80a06ae6..4c04ded433d1 100644 --- a/mysql-test/suite/perfschema/r/sxlock_func.result +++ b/mysql-test/suite/perfschema/r/sxlock_func.result @@ -20,6 +20,7 @@ wait/synch/sxlock/innodb/fts_cache_rw_lock wait/synch/sxlock/innodb/hash_table_locks wait/synch/sxlock/innodb/index_online_log wait/synch/sxlock/innodb/index_tree_rw_lock +wait/synch/sxlock/innodb/lock_sys_global_rw_lock wait/synch/sxlock/innodb/log_sn_lock wait/synch/sxlock/innodb/rsegs_lock wait/synch/sxlock/innodb/trx_i_s_cache_lock diff --git a/share/messages_to_error_log.txt b/share/messages_to_error_log.txt index 82a27b57c17e..c643313f0446 100644 --- a/share/messages_to_error_log.txt +++ b/share/messages_to_error_log.txt @@ -7451,10 +7451,10 @@ ER_IB_MSG_638 ER_IB_MSG_639 eng "%s" -ER_IB_MSG_640 +OBSOLETE_ER_IB_MSG_640 eng "%s" -ER_IB_MSG_641 +OBSOLETE_ER_IB_MSG_641 eng "%s" ER_IB_MSG_642 diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 882c8085bad3..91d6ba15af75 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2006, 2019, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2006, 2020, Oracle and/or its affiliates. All rights reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License, version 2.0, @@ -135,8 +135,11 @@ SET(INNOBASE_SOURCES lob/zlob0update.cc lob/zlob0first.cc lob/zlob0read.cc + lock/lock0guards.cc lock/lock0iter.cc lock/lock0prdt.cc + lock/lock0aarch64_atomic.cc + lock/lock0latches.cc lock/lock0lock.cc lock/lock0wait.cc log/log0buf.cc @@ -262,3 +265,8 @@ IF(HAS_WARN_FLAG) ADD_COMPILE_FLAGS(fts/fts0pars.cc COMPILE_FLAGS "${HAS_WARN_FLAG}") ENDIF() + +ADD_COMPILE_FLAGS( + lock/lock0aarch64_atomic.cc + COMPILE_FLAGS "-march=armv8-a+lse" +) diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 040fac51caa4..e343371205e9 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2020, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. 
This program is free software; you can redistribute it and/or modify it under @@ -2951,9 +2951,8 @@ static buf_block_t *btr_lift_page_up( if (!dict_table_is_locking_disabled(index->table)) { /* Free predicate page locks on the block */ if (dict_index_is_spatial(index)) { - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; lock_prdt_page_free_from_discard(block, lock_sys->prdt_page_hash); - lock_mutex_exit(); } lock_update_copy_and_discard(father_block, block); } @@ -3220,10 +3219,9 @@ ibool btr_compress( } /* No GAP lock needs to be worrying about */ - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; lock_prdt_page_free_from_discard(block, lock_sys->prdt_page_hash); lock_rec_free_all_from_discard_page(block); - lock_mutex_exit(); } else { btr_node_ptr_delete(index, block, mtr); if (!dict_table_is_locking_disabled(index->table)) { @@ -3355,10 +3353,9 @@ ibool btr_compress( rtr_merge_and_update_mbr(&cursor2, &father_cursor, offsets2, offsets, merge_page, merge_block, block, index, mtr); } - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; lock_prdt_page_free_from_discard(block, lock_sys->prdt_page_hash); lock_rec_free_all_from_discard_page(block); - lock_mutex_exit(); } else { compressed = btr_cur_pessimistic_delete( &err, TRUE, &cursor2, BTR_CREATE_FLAG, false, 0, 0, 0, mtr); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 25aec18f5bd5..0edbc98dda17 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2140,28 +2140,30 @@ static void buf_pool_resize() { message_interval *= 2; } - lock_mutex_enter(); - trx_sys_mutex_enter(); - bool found = false; - for (trx_t *trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); - trx != nullptr; trx = UT_LIST_GET_NEXT(mysql_trx_list, trx)) { - if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != nullptr && - ut_difftime(withdraw_started, trx->start_time) > 0) { - if (!found) { - ib::warn(ER_IB_MSG_61) << "The following trx might hold" - " the blocks in buffer pool to" - " be withdrawn. Buffer pool" - " resizing can complete only" - " after all the transactions" - " below release the blocks."; - found = true; - } + { + /* lock_trx_print_wait_and_mvcc_state() requires exclusive global latch */ + locksys::Global_exclusive_latch_guard guard{}; + trx_sys_mutex_enter(); + bool found = false; + for (trx_t *trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); + trx != nullptr; trx = UT_LIST_GET_NEXT(mysql_trx_list, trx)) { + if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != nullptr && + ut_difftime(withdraw_started, trx->start_time) > 0) { + if (!found) { + ib::warn(ER_IB_MSG_61) << "The following trx might hold" + " the blocks in buffer pool to" + " be withdrawn. Buffer pool" + " resizing can complete only" + " after all the transactions" + " below release the blocks."; + found = true; + } - lock_trx_print_wait_and_mvcc_state(stderr, trx); + lock_trx_print_wait_and_mvcc_state(stderr, trx); + } } + trx_sys_mutex_exit(); } - trx_sys_mutex_exit(); - lock_mutex_exit(); withdraw_started = ut_time(); } @@ -4404,14 +4406,6 @@ bool buf_page_get_known_nowait(ulint rw_latch, buf_block_t *block, return (true); } -/** Given a tablespace id and page number tries to get that page. If the -page is not in the buffer pool it is not loaded and NULL is returned. -Suitable for using when holding the lock_sys_t::mutex. 
-@param[in] page_id page id -@param[in] file file name -@param[in] line line where called -@param[in] mtr mini-transaction -@return pointer to a page or NULL */ const buf_block_t *buf_page_try_get_func(const page_id_t &page_id, const char *file, ulint line, mtr_t *mtr) { diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 45fe60f126db..c557b9862fea 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -745,12 +745,12 @@ class Batch_segment : public Segment { /** The instance that is being written to disk. */ Double_write *m_dblwr{}; - byte m_pad1[INNOBASE_CACHE_LINE_SIZE]; + byte m_pad1[ut::INNODB_CACHE_LINE_SIZE]; /** Size of the batch. */ std::atomic_int m_batch_size{}; - byte m_pad2[INNOBASE_CACHE_LINE_SIZE]; + byte m_pad2[ut::INNODB_CACHE_LINE_SIZE]; /** Number of pages to write. */ std::atomic_int m_written{}; diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 757d8d70b398..f7b15d584c96 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -1885,7 +1885,7 @@ static void dict_table_remove_from_cache_low( ut_ad(table); ut_ad(dict_lru_validate()); ut_a(table->get_ref_count() == 0); - ut_a(table->n_rec_locks == 0); + ut_a(table->n_rec_locks.load() == 0); ut_ad(mutex_own(&dict_sys->mutex)); ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); diff --git a/storage/innobase/dict/mem.cc b/storage/innobase/dict/mem.cc index b342d66ba4cb..6544ff392216 100644 --- a/storage/innobase/dict/mem.cc +++ b/storage/innobase/dict/mem.cc @@ -39,7 +39,9 @@ external tools. */ #include "dict0dict.h" #ifndef UNIV_HOTBACKUP +#ifndef UNIV_LIBRARY #include "lock0lock.h" +#endif /* !UNIV_LIBRARY */ #endif /* !UNIV_HOTBACKUP */ /** Append 'name' to 'col_names'. 
@see dict_table_t::col_names diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc index 3b55bde6475d..0871677693aa 100644 --- a/storage/innobase/gis/gis0sea.cc +++ b/storage/innobase/gis/gis0sea.cc @@ -1123,10 +1123,9 @@ void rtr_check_discard_page( mutex_exit(&index->rtr_track->rtr_active_mutex); - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; lock_prdt_page_free_from_discard(block, lock_sys->prdt_hash); lock_prdt_page_free_from_discard(block, lock_sys->prdt_page_hash); - lock_mutex_exit(); } /** Restore the stored position of a persistent cursor bufferfixing the page */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 146f4401b6f6..fc194d2e4939 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -663,7 +663,8 @@ static PSI_mutex_info all_innodb_mutexes[] = { PSI_MUTEX_KEY(trx_pool_manager_mutex, 0, 0, PSI_DOCUMENT_ME), PSI_MUTEX_KEY(temp_pool_manager_mutex, 0, 0, PSI_DOCUMENT_ME), PSI_MUTEX_KEY(srv_sys_mutex, 0, 0, PSI_DOCUMENT_ME), - PSI_MUTEX_KEY(lock_mutex, 0, 0, PSI_DOCUMENT_ME), + PSI_MUTEX_KEY(lock_sys_page_mutex, 0, 0, PSI_DOCUMENT_ME), + PSI_MUTEX_KEY(lock_sys_table_mutex, 0, 0, PSI_DOCUMENT_ME), PSI_MUTEX_KEY(lock_wait_mutex, 0, 0, PSI_DOCUMENT_ME), PSI_MUTEX_KEY(trx_mutex, 0, 0, PSI_DOCUMENT_ME), PSI_MUTEX_KEY(srv_threads_mutex, 0, 0, PSI_DOCUMENT_ME), @@ -699,6 +700,7 @@ static PSI_rwlock_info all_innodb_rwlocks[] = { PSI_RWLOCK_KEY(log_sn_lock, 0, PSI_DOCUMENT_ME), PSI_RWLOCK_KEY(undo_spaces_lock, 0, PSI_DOCUMENT_ME), PSI_RWLOCK_KEY(rsegs_lock, 0, PSI_DOCUMENT_ME), + PSI_RWLOCK_KEY(lock_sys_global_rw_lock, 0, PSI_DOCUMENT_ME), PSI_RWLOCK_KEY(fts_cache_rw_lock, 0, PSI_DOCUMENT_ME), PSI_RWLOCK_KEY(fts_cache_init_rw_lock, 0, PSI_DOCUMENT_ME), PSI_RWLOCK_KEY(trx_i_s_cache_lock, 0, PSI_DOCUMENT_ME), @@ -5515,25 +5517,13 @@ static bool innobase_rollback_to_savepoint_can_release_mdl( TrxInInnoDB trx_in_innodb(trx); - /* If transaction has not acquired any locks then it is safe - to release MDL after rollback to savepoint. - We assume that we are in the thread which is running the transaction, and - we check the length of this list without holding trx->mutex nor lock_sys - exclusive latch, so at least in theory other threads can concurrently modify - this list. However, such modifications are either implicit-to-explicit - conversions (which is only possible if trx has any implicit locks, which in - turn requires that it has acquired at least one IX table lock, so the list - is not empty) or related to B-tree reorganization (which is always performed - by first making a copy of a lock and then removing the old lock, so the number - of locks can not drop to zero). So, if we are only interested in "emptiness" - of the list, we should get accurate result without holding any latch. */ + trx_mutex_enter(trx); ut_ad(thd == current_thd); ut_ad(trx->lock.wait_lock == nullptr); - if (UT_LIST_GET_LEN(trx->lock.trx_locks) == 0) { - return true; - } + const bool has_no_locks = (UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + trx_mutex_exit(trx); - return false; + return has_no_locks; } /** Release transaction savepoint name. 
diff --git a/storage/innobase/handler/p_s.cc b/storage/innobase/handler/p_s.cc index 84512db936da..4775fb71f541 100644 --- a/storage/innobase/handler/p_s.cc +++ b/storage/innobase/handler/p_s.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2016, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -382,7 +382,9 @@ static const trx_t *fetch_trx_in_trx_list(uint64_t filter_trx_immutable_id, trx_ut_list_t *trx_list) { const trx_t *trx; - ut_ad(lock_mutex_own()); + /* It is not obvious if and why we need lock_sys exclusive access, but we do + own exclusive latch here, so treat this assert more as a documentation */ + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx_sys_mutex_own()); for (trx = UT_LIST_GET_FIRST(*trx_list); trx != nullptr; @@ -583,7 +585,8 @@ bool Innodb_data_lock_iterator::scan(PSI_server_data_lock_container *container, return true; } - lock_mutex_enter(); + /* We want locks reported in a single scan to be a consistent snapshot. */ + locksys::Global_exclusive_latch_guard guard{}; trx_sys_mutex_enter(); @@ -603,8 +606,6 @@ bool Innodb_data_lock_iterator::scan(PSI_server_data_lock_container *container, trx_sys_mutex_exit(); - lock_mutex_exit(); - return false; } @@ -629,7 +630,8 @@ bool Innodb_data_lock_iterator::fetch(PSI_server_data_lock_container *container, return true; } - lock_mutex_enter(); + /* scan_trx() requires exclusive global latch to iterate over locks of trx */ + locksys::Global_exclusive_latch_guard guard{}; trx_sys_mutex_enter(); @@ -646,8 +648,6 @@ bool Innodb_data_lock_iterator::fetch(PSI_server_data_lock_container *container, trx_sys_mutex_exit(); - lock_mutex_exit(); - return true; } @@ -666,7 +666,9 @@ size_t Innodb_data_lock_iterator::scan_trx_list( trx_id_t trx_id; size_t found = 0; - ut_ad(lock_mutex_own()); + /* We are about to scan over various locks of multiple transactions not + limited to any particular shard thus we need an exclusive latch on lock_sys */ + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx_sys_mutex_own()); for (trx = UT_LIST_GET_FIRST(*trx_list); trx != nullptr; @@ -732,7 +734,7 @@ size_t Innodb_data_lock_iterator::scan_trx( ulint heap_no; int record_type; lock_t *wait_lock; - + ut_ad(locksys::owns_exclusive_global_latch()); wait_lock = trx->lock.wait_lock; trx_id = trx_get_id_for_print(trx); @@ -856,7 +858,8 @@ bool Innodb_data_lock_wait_iterator::scan( return true; } - lock_mutex_enter(); + /* We want locks reported in a single scan to be a consistent snapshot. 
*/ + locksys::Global_exclusive_latch_guard guard{}; trx_sys_mutex_enter(); @@ -874,8 +877,6 @@ bool Innodb_data_lock_wait_iterator::scan( trx_sys_mutex_exit(); - lock_mutex_exit(); - return false; } @@ -915,7 +916,8 @@ bool Innodb_data_lock_wait_iterator::fetch( return true; } - lock_mutex_enter(); + /* scan_trx() requires exclusive global latch to iterate over locks of trx */ + locksys::Global_exclusive_latch_guard guard{}; trx_sys_mutex_enter(); @@ -934,8 +936,6 @@ bool Innodb_data_lock_wait_iterator::fetch( trx_sys_mutex_exit(); - lock_mutex_exit(); - return true; } @@ -952,7 +952,9 @@ size_t Innodb_data_lock_wait_iterator::scan_trx_list( trx_id_t trx_id; size_t found = 0; - ut_ad(lock_mutex_own()); + /* We are about to scan over various locks of multiple transactions not + limited to any particular shard thus we need an exclusive latch on lock_sys */ + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx_sys_mutex_own()); for (trx = UT_LIST_GET_FIRST(*trx_list); trx != nullptr; @@ -1007,6 +1009,7 @@ size_t Innodb_data_lock_wait_iterator::scan_trx( const void *blocking_identity; char blocking_engine_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; size_t blocking_engine_lock_id_length; + ut_ad(locksys::owns_exclusive_global_latch()); lock_t *wait_lock = trx->lock.wait_lock; const lock_t *curr_lock; int requesting_record_type; diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 13e999a3f540..19f95b1cc0f5 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -401,7 +401,7 @@ bool buf_page_get_known_nowait(ulint rw_latch, buf_block_t *block, /** Given a tablespace id and page number tries to get that page. If the page is not in the buffer pool it is not loaded and NULL is returned. -Suitable for using when holding the lock_sys_t::mutex. +Suitable for using when holding the lock_sys latches (as it avoids deadlock). @param[in] page_id page id @param[in] file file name @param[in] line line where called @@ -411,9 +411,9 @@ const buf_block_t *buf_page_try_get_func(const page_id_t &page_id, const char *file, ulint line, mtr_t *mtr); -/** Tries to get a page. -If the page is not in the buffer pool it is not loaded. Suitable for using -when holding the lock_sys_t::mutex. +/** Given a tablespace id and page number tries to get that page. If the +page is not in the buffer pool it is not loaded and NULL is returned. +Suitable for using when holding the lock_sys latches (as it avoids deadlock). @param[in] page_id page identifier @param[in] mtr mini-transaction @return the page if in buffer pool, NULL if not */ @@ -1478,6 +1478,10 @@ struct buf_block_t { new mutex in InnoDB-5.1 to relieve contention on the buffer pool mutex */ BPageMutex mutex; + /** Get the page number and space id of the current buffer block. + @return page number of the current buffer block. */ + const page_id_t &get_page_id() const { return page.id; } + /** Get the page number of the current buffer block. @return page number of the current buffer block. */ page_no_t get_page_no() const { return (page.id.page_no()); } diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index a9637f79eae9..3244c634383e 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -1883,8 +1883,8 @@ detect this and will eventually quit sooner. */ /* The actual collection of tables locked during AUTOINC read/write is kept in trx_t. 
In order to quickly determine whether a transaction has locked the AUTOINC lock we keep a pointer to the transaction here in - the 'autoinc_trx' member. This is to avoid acquiring the - lock_sys_t::mutex and scanning the vector in trx_t. + the 'autoinc_trx' member. This is to avoid acquiring lock_sys latches and + scanning the vector in trx_t. When an AUTOINC lock has to wait, the corresponding lock instance is created on the trx lock heap rather than use the pre-allocated instance in autoinc_lock below. */ @@ -1933,9 +1933,13 @@ detect this and will eventually quit sooner. */ be no conflict to access it, so no protection is needed. */ ulint autoinc_field_no; - /** The transaction that currently holds the the AUTOINC lock on this - table. Protected by lock_sys->mutex. */ - const trx_t *autoinc_trx; + /** The transaction that currently holds the the AUTOINC lock on this table. + Protected by lock_sys table shard latch. To "peek" the current value one + can read it without any latch, understanding that in general it may change. + Such access pattern is correct if trx thread wants to check if it has the lock + granted, as the field can only change to other value when lock is released, + which can not happen concurrently to thread executing the trx. */ + std::atomic autoinc_trx; /* @} */ @@ -1951,8 +1955,13 @@ detect this and will eventually quit sooner. */ /** Count of the number of record locks on this table. We use this to determine whether we can evict the table from the dictionary cache. - It is protected by lock_sys->mutex. */ - ulint n_rec_locks; + Writes (atomic increments and decrements) are performed when holding a shared + latch on lock_sys. (Note that this the table's shard latch is NOT required, + as this is field counts *record* locks, so a page shard is latched instead) + Reads should be performed when holding exclusive lock_sys latch, however: + - Some places assert this field is zero without holding any latch. + - Some places assert this field is positive holding only shared latch. */ + std::atomic n_rec_locks; #ifndef UNIV_DEBUG private: @@ -1964,7 +1973,7 @@ detect this and will eventually quit sooner. */ public: #ifndef UNIV_HOTBACKUP - /** List of locks on the table. Protected by lock_sys->mutex. */ + /** List of locks on the table. Protected by lock_sys shard latch. */ table_lock_list_t locks; /** count_by_mode[M] = number of locks in this->locks with lock->type_mode&LOCK_MODE_MASK == M. @@ -1972,12 +1981,12 @@ detect this and will eventually quit sooner. */ modes incompatible with LOCK_IS and LOCK_IX, to avoid costly iteration over this->locks when adding LOCK_IS or LOCK_IX. We use count_by_mode[LOCK_AUTO_INC] to track the number of granted and pending - autoinc locks on this table. This value is set after acquiring the - lock_sys_t::mutex but we peek the contents to determine whether other + autoinc locks on this table. This value is set after acquiring the lock_sys + table shard latch, but we peek the contents to determine whether other transactions have acquired the AUTOINC lock or not. Of course only one transaction can be granted the lock but there can be multiple waiters. - Protected by lock_sys->mutex. */ + Protected by lock_sys table shard latch. 
*/ ulong count_by_mode[LOCK_NUM]; #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/lock0aarch64_atomic.h b/storage/innobase/include/lock0aarch64_atomic.h new file mode 100644 index 000000000000..a6bf245c6d59 --- /dev/null +++ b/storage/innobase/include/lock0aarch64_atomic.h @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 2020, Huawei Technologies Co., Ltd. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. + +*****************************************************************************/ + +#ifndef lock0aarch64_atomic_h +#define lock0aarch64_atomic_h + +#include "univ.i" + +lint word_add_fetch(volatile lint *word, ulint amount); + +#endif /* lock0aarch64_atomic_h */ \ No newline at end of file diff --git a/storage/innobase/include/lock0guards.h b/storage/innobase/include/lock0guards.h new file mode 100644 index 000000000000..407f713b6b5b --- /dev/null +++ b/storage/innobase/include/lock0guards.h @@ -0,0 +1,173 @@ +/***************************************************************************** + +Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is also distributed with certain software (including but not +limited to OpenSSL) that is licensed under separate terms, as designated in a +particular file or component or in included license documentation. The authors +of MySQL hereby grant you an additional permission to link the program and +your derivative works with the separately licensed software that they have +included with MySQL. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#ifndef lock0guards_h +#define lock0guards_h + +#include "lock0lock.h" +#include "ut0class_life_cycle.h" + +namespace locksys { +/** +A RAII helper which latches global_latch in exclusive mode during constructor, +and unlatches it during destruction, preventing any other threads from activity +within lock_sys for it's entire scope. +*/ +class Global_exclusive_latch_guard : private ut::Non_copyable { + public: + Global_exclusive_latch_guard(); + ~Global_exclusive_latch_guard(); +}; + +/** +A RAII helper which tries to exclusively latch the global_lach in constructor +and unlatches it, if needed, during destruction, preventing any other threads +from activity within lock_sys for it's entire scope, if owns_lock(). 
+*/ +class Global_exclusive_try_latch : private ut::Non_copyable { + public: + Global_exclusive_try_latch(); + ~Global_exclusive_try_latch(); + /** Checks if succeeded to latch the global_latch during construction. + @return true iff the current thread owns (through this instance) the exclusive + global lock_sys latch */ + bool owns_lock() const noexcept { return m_owns_exclusive_global_latch; } + + private: + /** Did the constructor succeed to acquire exclusive global lock_sys latch? */ + bool m_owns_exclusive_global_latch; +}; + +/** +A RAII helper which latches global_latch in shared mode during constructor, +and unlatches it during destruction, preventing any other thread from acquiring +exclusive latch. This should be used in combination Shard_naked_latch_guard, +preferably by simply using Shard_latch_guard which combines the two for you. +*/ +class Global_shared_latch_guard : private ut::Non_copyable { + public: + Global_shared_latch_guard(); + ~Global_shared_latch_guard(); +}; + +/** +A RAII helper which latches the mutex protecting given shard during constructor, +and unlatches it during destruction. +You quite probably don't want to use this class, which only takes a shard's +latch, without acquiring global_latch - which gives no protection from threads +which latch only the global_latch exclusively to prevent any activity. +You should use it in combination with Global_shared_latch_guard, so that you +first obtain an s-latch on the global_latch, or simply use the Shard_latch_guard +class which already combines the two for you. +*/ +class Shard_naked_latch_guard : private ut::Non_copyable { + explicit Shard_naked_latch_guard(Lock_mutex &shard_mutex); + + public: + explicit Shard_naked_latch_guard(const dict_table_t &table); + + explicit Shard_naked_latch_guard(const page_id_t &page_id); + + ~Shard_naked_latch_guard(); + + private: + /** The mutex protecting the shard requested in constructor */ + Lock_mutex &m_shard_mutex; +}; + +/** +A RAII wrapper class which combines Global_shared_latch_guard and +Shard_naked_latch_guard to s-latch the global lock_sys latch and latch the mutex +protecting the specified shard for the duration of its scope. +The order of initialization is important: we have to take shared global latch +BEFORE we attempt to use hash function to compute correct shard and latch it. */ +class Shard_latch_guard { + Global_shared_latch_guard m_global_shared_latch_guard; + Shard_naked_latch_guard m_shard_naked_latch_guard; + + public: + explicit Shard_latch_guard(const dict_table_t &table) + : m_global_shared_latch_guard{}, m_shard_naked_latch_guard{table} {} + + explicit Shard_latch_guard(const page_id_t &page_id) + : m_global_shared_latch_guard{}, m_shard_naked_latch_guard{page_id} {} +}; + +/** +A RAII helper which latches the mutexes protecting specified shards for the +duration of its scope. +It makes sure to take the latches in correct order and handles the case where +both pages are in the same shard correctly. +You quite probably don't want to use this class, which only takes a shard's +latch, without acquiring global_latch - which gives no protection from threads +which latch only the global_latch exclusively to prevent any activity. +You should use it in combination with Global_shared_latch_guard, so that you +first obtain an s-latch on the global_latch, or simply use the +Shard_latches_guard class which already combines the two for you. 
+*/ +class Shard_naked_latches_guard { + explicit Shard_naked_latches_guard(Lock_mutex &shard_mutex_a, + Lock_mutex &shard_mutex_b); + + public: + explicit Shard_naked_latches_guard(const buf_block_t &block_a, + const buf_block_t &block_b); + + ~Shard_naked_latches_guard(); + + private: + /** The "smallest" of the two shards' mutexes in the latching order */ + Lock_mutex &m_shard_mutex_1; + /** The "largest" of the two shards' mutexes in the latching order */ + Lock_mutex &m_shard_mutex_2; + /** The ordering on shard mutexes used to avoid deadlocks */ + static constexpr std::less MUTEX_ORDER{}; +}; +/** +A RAII wrapper class which s-latches the global lock_sys shard, and mutexes +protecting specified shards for the duration of its scope. +It makes sure to take the latches in correct order and handles the case where +both pages are in the same shard correctly. +The order of initialization is important: we have to take shared global latch +BEFORE we attempt to use hash function to compute correct shard and latch it. +*/ +class Shard_latches_guard { + public: + explicit Shard_latches_guard(const buf_block_t &block_a, + const buf_block_t &block_b) + : m_global_shared_latch_guard{}, + m_shard_naked_latches_guard{block_a, block_b} {} + + ~Shard_latches_guard() {} + + private: + Global_shared_latch_guard m_global_shared_latch_guard; + Shard_naked_latches_guard m_shard_naked_latches_guard; +}; + +} // namespace locksys + +#endif /* lock0guards_h */ diff --git a/storage/innobase/include/lock0latches.h b/storage/innobase/include/lock0latches.h new file mode 100644 index 000000000000..0822679fed1f --- /dev/null +++ b/storage/innobase/include/lock0latches.h @@ -0,0 +1,299 @@ +/***************************************************************************** + +Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is also distributed with certain software (including but not +limited to OpenSSL) that is licensed under separate terms, as designated in a +particular file or component or in included license documentation. The authors +of MySQL hereby grant you an additional permission to link the program and +your derivative works with the separately licensed software that they have +included with MySQL. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ +#ifndef lock0latches_h +#define lock0latches_h + +#include "sync0sharded_rw.h" +#include "ut0cpu_cache.h" +#include "ut0mutex.h" + +/* Forward declarations */ +struct dict_table_t; +class page_id_t; + +namespace locksys { +/** +The class which handles the logic of latching of lock_sys queues themselves. +The lock requests for table locks and record locks are stored in queues, and to +allow concurrent operations on these queues, we need a mechanism to latch these +queues in safe and quick fashion. 
+In the past we had a single latch which protected access to all of them. +Now, we use more granular approach. +In extreme, one could imagine protecting each queue with a separate latch. +To avoid having too many latch objects, and having to create and remove them on +demand, we use a more conservative approach. +The queues are grouped into a fixed number of shards, and each shard is +protected by its own mutex. + +However, there are several rare events in which we need to "stop the world" - +latch all queues, to prevent any activity inside lock-sys. +One way to accomplish this would be to simply latch all the shards one by one, +but it turns out to be way too slow in debug runs, where such "stop the world" +events are very frequent due to lock_sys validation. + +To allow for efficient latching of everything, we've introduced a global_latch, +which is a read-write latch. +Most of the time, we operate on one or two shards, in which case it is +sufficient to s-latch the global_latch and then latch shard's mutex. +For the "stop the world" operations, we x-latch the global_latch, which prevents +any other thread from latching any shard. + +However, it turned out that on ARM architecture, the default implementation of +read-write latch (rw_lock_t) is too slow because increments and decrements of +the number of s-latchers is implemented as read-update-try-to-write loop, which +means multiple threads try to modify the same cache line disrupting each other. +Therefore, we use a sharded version of read-write latch (Sharded_rw_lock), which +internally uses multiple instances of rw_lock_t, spreading the load over several +cache lines. Note that this sharding is a technical internal detail of the +global_latch, which for all other purposes can be treated as a single entity. + +This his how this conceptually looks like: +``` + [ global latch ] + | + v + [table shard 1] ... [table shard 512] [page shard 1] ... [page shard 512] + +``` + +So, for example access two queues for two records involves following steps: +1. s-latch the global_latch +2. identify the 2 pages to which the records belong +3. identify the lock_sys 2 hash buckets which contain the queues for given pages +4. identify the 2 shard ids which contain these two buckets +5. latch mutexes for the two shards in the order of their addresses + +All of the steps above (except 2, as we usually know the page already) are +accomplished with the help of single line: + + locksys::Shard_latches_guard guard{*block_a, *block_b}; + +And to "stop the world" one can simply x-latch the global latch by using: + + locksys::Global_exclusive_latch_guard guard{}; + +This class does not expose too many public functions, as the intention is to +rather use friend guard classes, like the Shard_latches_guard demonstrated. +*/ +class Latches { + private: + using Lock_mutex = ib_mutex_t; + + /** A helper wrapper around Shared_rw_lock which simplifies: + - lifecycle by providing constructor and destructor, and + - s-latching and s-unlatching by keeping track of the shard id used for + spreading the contention. + There must be at most one instance of this class (the one in the lock_sys), as + it uses thread_local-s to remember which shard of sharded rw lock was used by + this thread to perform s-latching (so, hypothetical other instances would + share this field, overwriting it and leading to errors). 
*/ + class Unique_sharded_rw_lock { + /** The actual rw_lock implementation doing the heavy lifting */ + Sharded_rw_lock rw_lock; + + /** The value used for m_shard_id to indicate that current thread did not + s-latch any of the rw_lock's shards */ + static constexpr size_t NOT_IN_USE = std::numeric_limits::max(); + + /** The id of the rw_lock's shard which this thread has s-latched, or + NOT_IN_USE if it has not s-latched any*/ + static thread_local size_t m_shard_id; + + public: + Unique_sharded_rw_lock(); + ~Unique_sharded_rw_lock(); + bool try_x_lock() { return rw_lock.try_x_lock(); } + void x_lock() { rw_lock.x_lock(); } + void x_unlock() { rw_lock.x_unlock(); } + void s_lock() { + ut_ad(m_shard_id == NOT_IN_USE); + m_shard_id = rw_lock.s_lock(); + } + void s_unlock() { + ut_ad(m_shard_id != NOT_IN_USE); + rw_lock.s_unlock(m_shard_id); + m_shard_id = NOT_IN_USE; + } +#ifdef UNIV_DEBUG + bool x_own() const { return rw_lock.x_own(); } + bool s_own() const { + return m_shard_id != NOT_IN_USE && rw_lock.s_own(m_shard_id); + } +#endif + }; + + using Padded_mutex = ut::Cacheline_padded; + + /** Number of page shards, and also number of table shards. + Must be a power of two */ + static constexpr size_t SHARDS_COUNT = 512; + + /* + Functions related to sharding by page (containing records to lock). + + This must be done in such a way that two pages which share a single lock + queue fall into the same shard. We accomplish this by reusing hash function + used to determine lock queue, and then group multiple queues into single + shard. + */ + class Page_shards { + /** Each shard is protected by a separate mutex. Mutexes are padded to avoid + false sharing issues with cache. */ + Padded_mutex mutexes[SHARDS_COUNT]; + /** + Identifies the page shard which contains record locks for records from the + given page. + @param[in] page_id The space_id and page_no of the page + @return Integer in the range [0..lock_sys_t::SHARDS_COUNT) + */ + static size_t get_shard(const page_id_t &page_id); + + public: + Page_shards(); + ~Page_shards(); + + /** + Returns the mutex which (together with the global latch) protects the page + shard which contains record locks for records from the given page. + @param[in] page_id The space_id and page_no of the page + @return The mutex responsible for the shard containing the page + */ + const Lock_mutex &get_mutex(const page_id_t &page_id) const; + + /** + Returns the mutex which (together with the global latch) protects the page + shard which contains record locks for records from the given page. + @param[in] page_id The space_id and page_no of the page + @return The mutex responsible for the shard containing the page + */ + Lock_mutex &get_mutex(const page_id_t &page_id); + }; + + /* + Functions related to sharding by table + + We identify tables by their id. Each table has its own lock queue, so we + simply group several such queues into single shard. + */ + class Table_shards { + /** Each shard is protected by a separate mutex. Mutexes are padded to avoid + false sharing issues with cache. */ + Padded_mutex mutexes[SHARDS_COUNT]; + /** + Identifies the table shard which contains locks for the given table. + @param[in] table The table + @return Integer in the range [0..lock_sys_t::SHARDS_COUNT) + */ + static size_t get_shard(const dict_table_t &table); + + public: + Table_shards(); + ~Table_shards(); + + /** Returns the mutex which (together with the global latch) protects the + table shard which contains table locks for the given table. 
+ @param[in] table The table + @return The mutex responsible for the shard containing the table + */ + Lock_mutex &get_mutex(const dict_table_t &table); + + /** Returns the mutex which (together with the global latch) protects the + table shard which contains table locks for the given table. + @param[in] table The table + @return The mutex responsible for the shard containing the table + */ + const Lock_mutex &get_mutex(const dict_table_t &table) const; + }; + + /** padding to prevent other memory update hotspots from residing on the same + memory cache line */ + char pad1[ut::INNODB_CACHE_LINE_SIZE] = {}; + + Unique_sharded_rw_lock global_latch; + + Page_shards page_shards; + + Table_shards table_shards; + + public: + /* You should use following RAII guards to modify the state of Latches. */ + friend class Global_exclusive_latch_guard; + friend class Global_exclusive_try_latch; + friend class Global_shared_latch_guard; + friend class Shard_naked_latch_guard; + friend class Shard_naked_latches_guard; + + /** You should not use this functionality in new code. + Instead use Global_exclusive_latch_guard. + This is intended only to be use within lock0* module, thus this class is only + accessible through lock0priv.h. + It is only used by lock_rec_fetch_page() as a workaround. */ + friend class Unsafe_global_latch_manipulator; + + Latches() = default; + ~Latches() = default; + +#ifdef UNIV_DEBUG + /** + Tests if lock_sys latch is exclusively owned by the current thread. + @return true iff the current thread owns exclusive global lock_sys latch + */ + bool owns_exclusive_global_latch() const { return global_latch.x_own(); } + + /** + Tests if lock_sys latch is owned in shared mode by the current thread. + @return true iff the current thread owns shared global lock_sys latch + */ + bool owns_shared_global_latch() const { return global_latch.s_own(); } + + /** + Tests if given page shard can be safely accessed by the current thread. + @param[in] page_id The space_id and page_no of the page + @return true iff the current thread owns exclusive global lock_sys latch or + both a shared global lock_sys latch and mutex protecting the page shard + */ + bool owns_page_shard(const page_id_t &page_id) const { + return owns_exclusive_global_latch() || + (page_shards.get_mutex(page_id).is_owned() && + owns_shared_global_latch()); + } + + /** + Tests if given table shard can be safely accessed by the current thread. + @param table the table + @return true iff the current thread owns exclusive global lock_sys latch or + both a shared global lock_sys latch and mutex protecting the table shard + */ + bool owns_table_shard(const dict_table_t &table) const { + return owns_exclusive_global_latch() || + (table_shards.get_mutex(table).is_owned() && + owns_shared_global_latch()); + } +#endif /* UNIV_DEBUG */ +}; +} // namespace locksys + +#endif /* lock0latches_h */ diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 08a02d35b204..ba98ea8c0a63 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -47,6 +47,7 @@ this program; if not, write to the Free Software Foundation, Inc., #ifndef UNIV_HOTBACKUP #include "gis0rtree.h" #endif /* UNIV_HOTBACKUP */ +#include "lock0latches.h" #include "lock0prdt.h" /** @@ -389,9 +390,10 @@ void lock_rec_restore_from_page_infimum( the infimum */ /** Determines if there are explicit record locks on a page. 
- @return an explicit record lock on the page, or NULL if there are none */ -lock_t *lock_rec_expl_exist_on_page(space_id_t space, /*!< in: space id */ - page_no_t page_no) /*!< in: page number */ +@param[in] space space id +@param[in] page_no page number +@return true iff an explicit record lock on the page exists */ +bool lock_rec_expl_exist_on_page(space_id_t space, page_no_t page_no) MY_ATTRIBUTE((warn_unused_result)); /** Checks if locks of other transactions prevent an immediate insert of a record. If they do, first tests if the query thread should anyway @@ -608,7 +610,7 @@ void lock_rec_unlock( TRX_STATE_COMMITTED_IN_MEMORY. */ void lock_trx_release_locks(trx_t *trx); /*!< in/out: transaction */ -/** Release read locks of a transacion. It is called during XA +/** Release read locks of a transaction. It is called during XA prepare to release locks early. @param[in,out] trx transaction @param[in] only_gap release only GAP locks */ @@ -688,12 +690,8 @@ void lock_report_trx_id_insanity( trx_id_t max_trx_id); /*!< in: trx_sys_get_max_trx_id() */ /** Prints info of locks for all transactions. -@return false if not able to obtain lock mutex and exits without -printing info */ -bool lock_print_info_summary( - FILE *file, /*!< in: file where to print */ - ibool nowait) /*!< in: whether to wait for the lock mutex */ - MY_ATTRIBUTE((warn_unused_result)); +@param[in] file file where to print */ +void lock_print_info_summary(FILE *file); /** Prints transaction lock wait and MVCC state. @param[in,out] file file where to print @@ -701,16 +699,18 @@ bool lock_print_info_summary( void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx); /** Prints info of locks for each transaction. This function assumes that the - caller holds the lock mutex and more importantly it will release the lock - mutex on behalf of the caller. (This should be fixed in the future). */ -void lock_print_info_all_transactions( - FILE *file); /*!< in: file where to print */ +caller holds the exclusive global latch and more importantly it may release and +reacquire it on behalf of the caller. (This should be fixed in the future). +@param[in,out] file the file where to print */ +void lock_print_info_all_transactions(FILE *file); + /** Return approximate number or record locks (bits set in the bitmap) for this transaction. Since delete-marked records may be removed, the record count will not be precise. - The caller must be holding lock_sys->mutex. */ -ulint lock_number_of_rows_locked( - const trx_lock_t *trx_lock) /*!< in: transaction locks */ + The caller must be holding exclusive global lock_sys latch. + @param[in] trx_lock transaction locks + */ +ulint lock_number_of_rows_locked(const trx_lock_t *trx_lock) MY_ATTRIBUTE((warn_unused_result)); /** Return the number of table locks for a transaction. @@ -795,12 +795,13 @@ space_id_t lock_rec_get_space_id(const lock_t *lock); /*!< in: lock */ /** For a record lock, gets the page number on which the lock is. @return page number */ page_no_t lock_rec_get_page_no(const lock_t *lock); /*!< in: lock */ + /** Check if there are any locks (table or rec) against table. - @return true if locks exist */ -bool lock_table_has_locks( - const dict_table_t *table); /*!< in: check if there are any locks - held on records in this table or on the - table itself */ +Returned value might be obsolete. 
+@param[in] table the table +@return true if there were any locks held on records in this table or on the +table itself at some point in time during the call */ +bool lock_table_has_locks(const dict_table_t *table); /** A thread which wakes up threads whose lock wait may have lasted too long. */ void lock_wait_timeout_thread(); @@ -841,6 +842,7 @@ bool lock_check_trx_id_sanity( const dict_index_t *index, /*!< in: index */ const ulint *offsets) /*!< in: rec_get_offsets(rec, index) */ MY_ATTRIBUTE((warn_unused_result)); + /** Check if the transaction holds an exclusive lock on a record. @param[in] thr query thread of the transaction @param[in] table table to check @@ -850,6 +852,10 @@ bool lock_check_trx_id_sanity( bool lock_trx_has_rec_x_lock(que_thr_t *thr, const dict_table_t *table, const buf_block_t *block, ulint heap_no) MY_ATTRIBUTE((warn_unused_result)); + +/** Validates the lock system. + @return true if ok */ +bool lock_validate(); #endif /* UNIV_DEBUG */ /** @@ -930,52 +936,51 @@ struct lock_op_t { lock_mode mode; /*!< lock mode */ }; -typedef ib_mutex_t LockMutex; +typedef ib_mutex_t Lock_mutex; /** The lock system struct */ struct lock_sys_t { - char pad1[INNOBASE_CACHE_LINE_SIZE]; - /*!< padding to prevent other - memory update hotspots from - residing on the same memory - cache line */ - LockMutex mutex; /*!< Mutex protecting the - locks */ - hash_table_t *rec_hash; /*!< hash table of the record - locks */ - hash_table_t *prdt_hash; /*!< hash table of the predicate - lock */ - hash_table_t *prdt_page_hash; /*!< hash table of the page - lock */ - - char pad2[INNOBASE_CACHE_LINE_SIZE]; /*!< Padding */ - LockMutex wait_mutex; /*!< Mutex protecting the - next two fields */ - srv_slot_t *waiting_threads; /*!< Array of user threads - suspended while waiting for - locks within InnoDB, protected - by the lock_sys->wait_mutex */ - srv_slot_t *last_slot; /*!< highest slot ever used - in the waiting_threads array, - protected by - lock_sys->wait_mutex */ - - ibool rollback_complete; - /*!< TRUE if rollback of all - recovered transactions is - complete. Protected by - lock_sys->mutex */ - - ulint n_lock_max_wait_time; /*!< Max wait time */ - - os_event_t timeout_event; /*!< Set to the event that is - created in the lock wait monitor - thread. A value of 0 means the - thread is not active */ + /** The latches protecting queues of record and table locks */ + locksys::Latches latches; + + /** The hash table of the record (LOCK_REC) locks, except for predicate + (LOCK_PREDICATE) and predicate page (LOCK_PRDT_PAGE) locks */ + hash_table_t *rec_hash; + + /** The hash table of predicate (LOCK_PREDICATE) locks */ + hash_table_t *prdt_hash; + + /** The hash table of the predicate page (LOCK_PRD_PAGE) locks */ + hash_table_t *prdt_page_hash; + + /** Padding to avoid false sharing of wait_mutex field */ + char pad2[ut::INNODB_CACHE_LINE_SIZE]; + + /** The mutex protecting the next two fields */ + Lock_mutex wait_mutex; + + /** Array of user threads suspended while waiting for locks within InnoDB. + Protected by the lock_sys->wait_mutex. */ + srv_slot_t *waiting_threads; + + /** The highest slot ever used in the waiting_threads array. + Protected by lock_sys->wait_mutex. */ + srv_slot_t *last_slot; + + /** TRUE if rollback of all recovered transactions is complete. + Protected by exclusive global lock_sys latch. */ + bool rollback_complete; + + /** Max lock wait time observed, for innodb_row_lock_time_max reporting. 
*/ + ulint n_lock_max_wait_time; + + /** Set to the event that is created in the lock wait monitor thread. A value + of 0 means the thread is not active */ + os_event_t timeout_event; #ifdef UNIV_DEBUG - /** Lock timestamp counter */ - uint64_t m_seq; + /** Lock timestamp counter, used to assign lock->m_seq on creation. */ + std::atomic m_seq; #endif /* UNIV_DEBUG */ }; @@ -1023,27 +1028,12 @@ void lock_rec_trx_wait(lock_t *lock, ulint i, ulint type); /** The lock system */ extern lock_sys_t *lock_sys; -/** Test if lock_sys->mutex can be acquired without waiting. */ -#define lock_mutex_enter_nowait() (lock_sys->mutex.trylock(__FILE__, __LINE__)) - -/** Test if lock_sys->mutex is owned by the current thread. */ -#define lock_mutex_own() (lock_sys->mutex.is_owned()) - -/** Acquire the lock_sys->mutex. */ -#define lock_mutex_enter() \ - do { \ - mutex_enter(&lock_sys->mutex); \ - } while (0) - -/** Release the lock_sys->mutex. */ -#define lock_mutex_exit() \ - do { \ - lock_sys->mutex.exit(); \ - } while (0) - +#ifdef UNIV_DEBUG /** Test if lock_sys->wait_mutex is owned. */ #define lock_wait_mutex_own() (lock_sys->wait_mutex.is_owned()) +#endif + /** Acquire the lock_sys->wait_mutex. */ #define lock_wait_mutex_enter() \ do { \ @@ -1058,4 +1048,51 @@ extern lock_sys_t *lock_sys; #include "lock0lock.ic" +namespace locksys { + +/* OWNERSHIP TESTS */ +#ifdef UNIV_DEBUG + +/** +Tests if lock_sys latch is exclusively owned by the current thread. +@return true iff the current thread owns exclusive global lock_sys latch +*/ +bool owns_exclusive_global_latch(); + +/** +Tests if lock_sys latch is owned in shared mode by the current thread. +@return true iff the current thread owns shared global lock_sys latch +*/ +bool owns_shared_global_latch(); + +/** +Tests if given page shard can be safely accessed by the current thread. +@param page_id specifies the page +@return true iff the current thread owns exclusive global lock_sys latch or both +a shared global lock_sys latch and mutex protecting the page shard +*/ +bool owns_page_shard(const page_id_t &page_id); + +/** +Test if given table shard can be safely accessed by the current thread. +@param table the table +@return true iff the current thread owns exclusive global lock_sys latch or both + a shared global lock_sys latch and mutex protecting the table shard +*/ +bool owns_table_shard(const dict_table_t &table); + +/** Checks if shard which contains lock is latched (or that an exclusive latch +on whole lock_sys is held) by current thread +@param[in] lock lock which belongs to a shard we want to check +@return true iff the current thread owns exclusive global lock_sys latch or both + a shared global lock_sys latch and mutex protecting the shard containing + the specified lock */ +bool owns_lock_shard(const lock_t *lock); + +#endif /* UNIV_DEBUG */ + +} // namespace locksys + +#include "lock0guards.h" + #endif diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic index 5051a49d6dd5..7826c662b6fc 100644 --- a/storage/innobase/include/lock0lock.ic +++ b/storage/innobase/include/lock0lock.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2020, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h index 318b72c7a8ed..cb2903b0858a 100644 --- a/storage/innobase/include/lock0priv.h +++ b/storage/innobase/include/lock0priv.h @@ -86,6 +86,7 @@ struct lock_rec_t { placed immediately after the lock struct */ + page_id_t get_page_id() const { return page_id_t(space, page_no); } /** Print the record lock into the given output stream @param[in,out] out the output stream @return the given output stream. */ @@ -128,7 +129,7 @@ bool lock_mode_is_next_key_lock(ulint mode) { UNIV_INLINE bool lock_rec_get_nth_bit(const lock_t *lock, ulint i); -/** Lock struct; protected by lock_sys->mutex */ +/** Lock struct; protected by lock_sys latches */ struct lock_t { /** transaction owning the lock */ trx_t *trx; @@ -612,26 +613,19 @@ struct RecID { @param[in] lock Record lock @param[in] heap_no Heap number in the page */ RecID(const lock_t *lock, ulint heap_no) - : m_space_id(lock->rec_lock.space), - m_page_no(lock->rec_lock.page_no), - m_heap_no(static_cast(heap_no)), - m_fold(lock_rec_fold(m_space_id, m_page_no)) { - ut_ad(m_space_id < UINT32_MAX); - ut_ad(m_page_no < UINT32_MAX); - ut_ad(m_heap_no < UINT32_MAX); + : RecID(lock->rec_lock.get_page_id(), heap_no) { + ut_ad(lock->is_record_lock()); } /** Constructor - @param[in] space_id Tablespace ID - @param[in] page_no Page number in space_id - @param[in] heap_no Heap number in */ - RecID(space_id_t space_id, page_no_t page_no, ulint heap_no) - : m_space_id(space_id), - m_page_no(page_no), - m_heap_no(static_cast(heap_no)), - m_fold(lock_rec_fold(m_space_id, m_page_no)) { - ut_ad(m_space_id < UINT32_MAX); - ut_ad(m_page_no < UINT32_MAX); + @param[in] page_id Tablespace ID and page number within space + @param[in] heap_no Heap number in the page */ + RecID(page_id_t page_id, uint32_t heap_no) + : m_page_id(page_id), + m_heap_no(heap_no), + m_fold(lock_rec_fold(page_id.space(), page_id.page_no())) { + ut_ad(m_page_id.space() < UINT32_MAX); + ut_ad(m_page_id.page_no() < UINT32_MAX); ut_ad(m_heap_no < UINT32_MAX); } @@ -639,12 +633,7 @@ struct RecID { @param[in] block Block in a tablespace @param[in] heap_no Heap number in the block */ RecID(const buf_block_t *block, ulint heap_no) - : m_space_id(block->page.id.space()), - m_page_no(block->page.id.page_no()), - m_heap_no(static_cast(heap_no)), - m_fold(lock_rec_fold(m_space_id, m_page_no)) { - ut_ad(heap_no < UINT32_MAX); - } + : RecID(block->get_page_id(), heap_no) {} /** @return the "folded" value of {space, page_no} */ @@ -658,13 +647,10 @@ struct RecID { @return true if matches the lock. 
*/ inline bool matches(const lock_t *lock) const; - /** - Tablespace ID */ - space_id_t m_space_id; + const page_id_t &get_page_id() const { return m_page_id; } - /** - Page number within the space ID */ - page_no_t m_page_no; + /** Tablespace ID and page number within space */ + page_id_t m_page_id; /** Heap number within the page */ @@ -801,7 +787,7 @@ class RecLock { /** Setup the context from the requirements */ void init(const page_t *page) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(m_rec_id.get_page_id())); ut_ad(!srv_read_only_mode); ut_ad(m_index->is_clustered() || !dict_index_is_online_ddl(m_index)); ut_ad(m_thr == nullptr || m_trx == thr_get_trx(m_thr)); @@ -1070,7 +1056,7 @@ struct Lock_iter { @param[in] lock The current lock @return matching lock or nullptr if end of list */ static lock_t *advance(const RecID &rec_id, lock_t *lock) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(rec_id.get_page_id())); ut_ad(lock->is_record_lock()); while ((lock = static_cast(lock->hash)) != nullptr) { @@ -1090,7 +1076,7 @@ struct Lock_iter { @param[in] rec_id Record ID @return first lock, nullptr if none exists */ static lock_t *first(hash_cell_t *list, const RecID &rec_id) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(rec_id.get_page_id())); auto lock = static_cast(list->node); @@ -1111,7 +1097,7 @@ struct Lock_iter { template static const lock_t *for_each(const RecID &rec_id, F &&f, hash_table_t *hash_table = lock_sys->rec_hash) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(rec_id.get_page_id())); auto list = hash_get_nth_cell(hash_table, hash_calc_hash(rec_id.m_fold, hash_table)); @@ -1129,4 +1115,12 @@ struct Lock_iter { } }; +namespace locksys { +class Unsafe_global_latch_manipulator { + public: + static void exclusive_unlatch() { lock_sys->latches.global_latch.x_unlock(); } + static void exclusive_latch() { lock_sys->latches.global_latch.x_lock(); } +}; +} // namespace locksys + #endif /* lock0priv_h */ diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic index 5d95c90a5d44..a9eb6270c8f0 100644 --- a/storage/innobase/include/lock0priv.ic +++ b/storage/innobase/include/lock0priv.ic @@ -109,7 +109,7 @@ lock_t *lock_rec_get_first_on_page_addr( space_id_t space, /*!< in: space */ page_no_t page_no) /*!< in: page number */ { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(page_id_t{space, page_no})); for (lock_t *lock = static_cast( HASH_GET_FIRST(lock_hash, lock_rec_hash(space, page_no))); @@ -131,7 +131,7 @@ lock_t *lock_rec_get_first_on_page( hash_table_t *lock_hash, /*!< in: lock hash table */ const buf_block_t *block) /*!< in: buffer block */ { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); space_id_t space = block->page.id.space(); page_no_t page_no = block->page.id.page_no(); @@ -155,7 +155,7 @@ UNIV_INLINE lock_t *lock_rec_get_next(ulint heap_no, /*!< in: heap number of the record */ lock_t *lock) /*!< in: lock */ { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); do { ut_ad(lock_get_type_low(lock) == LOCK_REC); @@ -184,7 +184,7 @@ lock_t *lock_rec_get_first( const buf_block_t *block, /*!< in: block containing the record */ ulint heap_no) /*!< in: heap number of the record */ { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); for (lock_t *lock = lock_rec_get_first_on_page(hash, block); lock; lock = lock_rec_get_next_on_page(lock)) { @@ -222,15 +222,13 @@ 
UNIV_INLINE const lock_t *lock_rec_get_next_on_page_const( const lock_t *lock) /*!< in: a record lock */ { - ut_ad(lock_mutex_own()); ut_ad(lock_get_type_low(lock) == LOCK_REC); - - space_id_t space = lock->space_id(); - page_no_t page_no = lock->page_no(); + const auto page_id = lock->rec_lock.get_page_id(); + ut_ad(locksys::owns_page_shard(page_id)); while ((lock = static_cast(HASH_GET_NEXT(hash, lock))) != nullptr) { - if (lock->space_id() == space && lock->page_no() == page_no) { + if (page_id.equals_to(lock->rec_lock.get_page_id())) { return (lock); } } @@ -283,10 +281,9 @@ ulint lock_get_wait(const lock_t *lock) /*!< in: lock */ UNIV_INLINE void lock_reset_lock_and_trx_wait(lock_t *lock) /*!< in/out: record lock */ { - ut_ad(lock->trx->lock.wait_lock == lock); + ut_ad(locksys::owns_lock_shard(lock)); ut_ad(lock_get_wait(lock)); - ut_ad(lock_mutex_own()); - + ut_ad(lock->trx->lock.wait_lock == lock); /* We intentionally don't clear trx->lock.blocking_trx here, as lock_reset_lock_and_trx_wait() is called also during movements of locks from one page to another, which does not really change the structure of the @@ -294,6 +291,7 @@ void lock_reset_lock_and_trx_wait(lock_t *lock) /*!< in/out: record lock */ is responsible for clearing the blocking_trx field once it is sure that we really want to remove the edge from the wait-for graph.*/ lock->trx->lock.wait_lock = nullptr; + /* We intentionally don't clear lock->trx->lock.wait_lock_type here, to make it easier to obtain stats about the last wait in lock_wait_suspend_thread(). @see trx_lock_t::wait_lock_type for more detailed explanation. */ @@ -346,8 +344,7 @@ bool lock_table_has(const trx_t *trx, const dict_table_t *table, @param[i] lock Lock to compare with @return true if matches the lock. */ bool RecID::matches(const lock_t *lock) const { - return (lock->rec_lock.space == m_space_id && - lock->rec_lock.page_no == m_page_no && + return (lock->rec_lock.get_page_id().equals_to(get_page_id()) && lock_rec_get_nth_bit(lock, m_heap_no)); } diff --git a/storage/innobase/include/log0types.h b/storage/innobase/include/log0types.h index a9e252c25776..544d15dca28e 100644 --- a/storage/innobase/include/log0types.h +++ b/storage/innobase/include/log0types.h @@ -133,7 +133,7 @@ struct Log_handle { /** Redo log - single data structure with state of the redo log system. In future, one could consider splitting this to multiple data structures. */ -struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { +struct alignas(ut::INNODB_CACHE_LINE_SIZE) log_t { /**************************************************/ /** @name Users writing to log buffer @@ -148,7 +148,7 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { Log archiver (Clone plugin) acquires x-lock. */ mutable Sharded_rw_lock sn_lock; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** Current sn value. Used to reserve space in the redo log, and used to acquire an exclusive access to the log buffer. @@ -160,7 +160,7 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { /** Padding after the _sn to avoid false sharing issues for constants below (due to changes of sn). */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** Pointer to the log buffer, aligned up to OS_FILE_LOG_BLOCK_SIZE. The alignment is to ensure that buffer parts specified for file IO write @@ -177,19 +177,19 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { that is including bytes for headers and footers of log blocks. 
*/ size_t buf_size; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** The recent written buffer. Protected by: sn_lock or writer_mutex. */ Link_buf recent_written; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** The recent closed buffer. Protected by: sn_lock or closer_mutex. */ Link_buf recent_closed; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ @@ -208,14 +208,14 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { Protected by: writer_mutex (writes). */ atomic_sn_t buf_limit_sn; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** Up to this lsn, data has been written to disk (fsync not required). Protected by: writer_mutex (writes). @see @ref subsect_redo_log_write_lsn */ atomic_lsn_t write_lsn; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** Unaligned pointer to array with events, which are used for notifications sent from the log write notifier thread to user threads. @@ -231,17 +231,17 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { size_t write_events_size; /** Approx. number of requests to write/flush redo since startup. */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) std::atomic write_to_file_requests_total; /** How often redo write/flush is requested in average. Measures in microseconds. Log threads do not spin when the write/flush requests are not frequent. */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) std::atomic write_to_file_requests_interval; /** This padding is probably not needed, left for convenience. */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ @@ -268,13 +268,13 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { size_t flush_events_size; /** Padding before the frequently updated flushed_to_disk_lsn. */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** Up to this lsn data has been flushed to disk (fsynced). */ atomic_lsn_t flushed_to_disk_lsn; /** Padding after the frequently updated flushed_to_disk_lsn. */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ @@ -299,13 +299,13 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { /** Mutex which can be used to pause log flusher thread. */ mutable ib_mutex_t flusher_mutex; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) os_event_t flusher_event; /** Padding to avoid any dependency between the log flusher and the log writer threads. */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ @@ -387,13 +387,13 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { /** Mutex which can be used to pause log writer thread. */ mutable ib_mutex_t writer_mutex; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) os_event_t writer_event; /** Padding after section for the log writer thread, to avoid any dependency between the log writer and the log closer threads. */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ @@ -413,7 +413,7 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { /** Padding after the log closer thread and before the memory used for communication between the log flusher and notifier threads. 
*/ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ @@ -435,7 +435,7 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { mutable ib_mutex_t flush_notifier_mutex; /** Padding. */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ @@ -450,14 +450,14 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { /** Mutex which can be used to pause log write notifier thread. */ mutable ib_mutex_t write_notifier_mutex; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** Event used by the log writer thread to notify the log write notifier thread, that it should proceed with notifying user threads waiting for the advanced write_lsn (because it has been advanced). */ os_event_t write_notifier_event; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ @@ -515,7 +515,7 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { #endif /* UNIV_DEBUG */ - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ @@ -601,7 +601,7 @@ struct alignas(INNOBASE_CACHE_LINE_SIZE) log_t { Protected by (updates only): limits_mutex. */ atomic_sn_t dict_persist_margin; - alignas(INNOBASE_CACHE_LINE_SIZE) + alignas(ut::INNODB_CACHE_LINE_SIZE) /** @} */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index f74f18ba8840..946d7803a9d6 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -91,7 +91,7 @@ struct Block { byte *m_ptr; - byte pad[INNOBASE_CACHE_LINE_SIZE - sizeof(ulint)]; + byte pad[ut::INNODB_CACHE_LINE_SIZE - sizeof(ulint)]; lock_word_t m_in_use; }; } // namespace file diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h index 7ee7356ad46d..81a4f3c0a617 100644 --- a/storage/innobase/include/que0que.h +++ b/storage/innobase/include/que0que.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -86,10 +86,12 @@ void que_graph_free(que_t *graph); /*!< in: query graph; we assume that the que_graph_free_recursive and free the heap afterwards! */ /** Stops a query thread if graph or trx is in a state requiring it. The - conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex - has to be reserved. + conditions are tested in the order (1) graph, (2) trx. + Caller must hold the trx mutex. + @param[in,out] thr query thread @return true if stopped */ -ibool que_thr_stop(que_thr_t *thr); /*!< in: query thread */ +bool que_thr_stop(que_thr_t *thr); + /** Moves a thread from another state to the QUE_THR_RUNNING state. Increments the n_active_thrs counters of the query graph and transaction. */ void que_thr_move_to_run_state_for_mysql( @@ -272,9 +274,11 @@ struct que_thr_t { /** The thread slot in the lock_sys->waiting_threads array protected by lock_sys->wait_mutex when writing to it, and also by trx->mutex when changing from null to non-null. 
- While reading, one either hold the lock_sys->wait_mutex, or hold the - lock_sys->mutex, trx->mutex and a proof that noone else has woken the trx yet, - so the slot is either null, or changing to non-null, but definitely not + While reading, one can either hold the lock_sys->wait_mutex, or hold the + trx->mutex and a proof that no one has woken the trx yet, + so the slot is either still null (if trx hadn't yet started the sleep), or + already non-null (if it already started sleep), but definitely not + changing from null to non-null (as it requires trx->mutex) nor changing from non-null to null (as it happens after wake up). */ struct srv_slot_t *slot; /*------------------------------*/ diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index 61ad27f7e921..bccba812ba3f 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -44,6 +44,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "rem/rec.h" #include "rem0types.h" #include "trx0types.h" +#include "ut0class_life_cycle.h" /** The following function is used to get the pointer of the next chained record on the same page. @@ -439,6 +440,81 @@ ulint rec_get_data_size_old(const rec_t *rec) /*!< in: physical record */ MY_ATTRIBUTE((warn_unused_result)); #define rec_offs_init(offsets) \ rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets) + +/** +A helper RAII wrapper for otherwise difficult to use sequence of: + + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + mem_heap_t *heap = nullptr; + + const ulint *offsets = + rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap); + + DO_SOMETHING(offsets); + + if (heap != nullptr) { + mem_heap_free(heap); + } + +With this helper you can simply do: + + DO_SOMETHING(Rec_offsets().compute(rec,index)); + +And if you need to reuse the memory allocated offsets several times you can: + Rec_offsets offsets; + for(rec: recs) DO_SOMTHING(offsets.compute(rec,index)) +*/ +class Rec_offsets : private ut::Non_copyable { + public: + /** Prepares offsets to initially point to the fixed-size buffer, and marks + the memory as allocated, but uninitialized. You first need to call compute() + to use it */ + Rec_offsets() { rec_offs_init(m_preallocated_buffer); } + + /** Computes offsets for given record. Returned array is owned by this + instance. You can use its value as long as this object does not go out of + scope (which can free the buffer), and you don't call compute() again (which + can overwrite the offsets). + @param[in] rec The record for which you want to compute the offsets + @param[in] index The index which contains the record + @return A pointer to offsets array owned by this instance. Valid till next + call to compute() or end of this instance lifetime. + */ + const ulint *compute(const rec_t *rec, const dict_index_t *index) { + m_offsets = + rec_get_offsets(rec, index, m_offsets, ULINT_UNDEFINED, &m_heap); + return m_offsets; + } + /** Deallocated dynamically allocated memory, if any. 
*/ + ~Rec_offsets() { + if (m_heap) { + mem_heap_free(m_heap); + m_heap = nullptr; + } + } + + private: + /** Pointer to heap used by rec_get_offsets(). Initially nullptr. If row is + really big, rec_get_offsets() may need to allocate new buffer for offsets. + At, first, when heap is null, rec_get_offsets() will create new heap, and pass + it back via reference. On subsequent calls, we will pass this heap, so it + is reused if needed. Therefore all allocated buffers are in this heap, if it + is not nullptr */ + mem_heap_t *m_heap{nullptr}; + + /** Buffer with size large enough to handle common cases without having to use + heap. This is the initial value of m_offsets.*/ + ulint m_preallocated_buffer[REC_OFFS_NORMAL_SIZE]; + + /* Initially points to m_preallocated_buffer (which is uninitialized memory). + After each call to compute() contains the pointer to the most recently + computed offsets. + We pass it back to rec_get_offsets() on subsequent calls to compute() to reuse + the same memory if possible. */ + ulint *m_offsets{m_preallocated_buffer}; +}; + /** The following function returns the data size of a physical record, that is the sum of field lengths. SQL null fields are counted as length 0 fields. The value returned by the function diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h index e8bf13693b6e..b6cc006b126c 100644 --- a/storage/innobase/include/row0vers.h +++ b/storage/innobase/include/row0vers.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -48,14 +48,17 @@ class ReadView; /** Finds out if an active transaction has inserted or modified a secondary index record. + @param[in] rec record in a secondary index + @param[in] index the secondary index + @param[in] offsets rec_get_offsets(rec, index) @return 0 if committed, else the active transaction id; NOTE that this function can return false positives but never false - negatives. The caller must confirm all positive results by calling - trx_is_active() while holding lock_sys->mutex. */ -trx_t *row_vers_impl_x_locked( - const rec_t *rec, /*!< in: record in a secondary index */ - const dict_index_t *index, /*!< in: the secondary index */ - const ulint *offsets); /*!< in: rec_get_offsets(rec, index) */ + negatives. The caller must confirm all positive results by checking if the trx + is still active. +*/ +trx_t *row_vers_impl_x_locked(const rec_t *rec, const dict_index_t *index, + const ulint *offsets); + /** Finds out if we must preserve a delete marked earlier version of a clustered index record, because it is >= the purge view. @param[in] trx_id transaction id in the version diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 5966f131b6d9..3b2ceacebdbe 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -950,16 +950,16 @@ void srv_active_wake_master_thread_low(void); void srv_wake_master_thread(void); #ifndef UNIV_HOTBACKUP /** Outputs to a file the output of the InnoDB Monitor. 
- @return false if not all information printed - due to failure to obtain necessary mutex */ -ibool srv_printf_innodb_monitor( - FILE *file, /*!< in: output stream */ - ibool nowait, /*!< in: whether to wait for the - lock_sys_t::mutex */ - ulint *trx_start, /*!< out: file position of the start of - the list of active transactions */ - ulint *trx_end); /*!< out: file position of the end of - the list of active transactions */ +@param[in] file output stream +@param[in] nowait whether to wait for the exclusive global lock_sys latch +@param[out] trx_start file position of the start of the list of active + transactions +@param[out] trx_end file position of the end of the list of active + transactions +@return false if not all information printed due to failure to obtain necessary + mutex */ +bool srv_printf_innodb_monitor(FILE *file, bool nowait, ulint *trx_start, + ulint *trx_end); /** Function to pass InnoDB status variables to MySQL */ void srv_export_innodb_status(void); diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic index 2380841f26a1..2af0993e0ea7 100644 --- a/storage/innobase/include/sync0rw.ic +++ b/storage/innobase/include/sync0rw.ic @@ -38,6 +38,7 @@ this program; if not, write to the Free Software Foundation, Inc., *******************************************************/ #include "os0event.h" +#include "lock0aarch64_atomic.h" /** Lock an rw-lock in shared mode for the current thread. If the rw-lock is locked in exclusive mode, or there is an exclusive lock request waiting, @@ -259,7 +260,7 @@ lint rw_lock_lock_word_incr(rw_lock_t *lock, /*!< in/out: rw-lock */ ulint amount) /*!< in: amount of increment */ { #ifdef INNODB_RW_LOCKS_USE_ATOMICS - return (os_atomic_increment_lint(&lock->lock_word, amount)); + return word_add_fetch(&lock->lock_word, amount); #else /* INNODB_RW_LOCKS_USE_ATOMICS */ lint local_lock_word; diff --git a/storage/innobase/include/sync0sharded_rw.h b/storage/innobase/include/sync0sharded_rw.h index c53850d1a3b5..0431fc7ed747 100644 --- a/storage/innobase/include/sync0sharded_rw.h +++ b/storage/innobase/include/sync0sharded_rw.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2017, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -44,6 +44,7 @@ the file COPYING.Google. #define sync0sharded_rw_h #include "sync0rw.h" +#include "ut0cpu_cache.h" #include "ut0rnd.h" #include "ut0ut.h" @@ -84,18 +85,35 @@ class Sharded_rw_lock { size_t s_lock() { const size_t shard_no = ut_rnd_interval(0, m_n_shards - 1); - rw_lock_s_lock(&m_shards[shard_no].lock); + rw_lock_s_lock(&m_shards[shard_no]); return shard_no; } ibool s_lock_nowait(size_t &shard_no, const char *file, ulint line) { shard_no = ut_rnd_interval(0, m_n_shards - 1); - return rw_lock_s_lock_nowait(&m_shards[shard_no].lock, file, line); + return rw_lock_s_lock_nowait(&m_shards[shard_no], file, line); } void s_unlock(size_t shard_no) { ut_a(shard_no < m_n_shards); - rw_lock_s_unlock(&m_shards[shard_no].lock); + rw_lock_s_unlock(&m_shards[shard_no]); + } + + /** + Tries to obtain exclusive latch - similar to x_lock(), but non-blocking, and + thus can fail. 
+ @return true iff succeeded to acquire the exclusive latch + */ + bool try_x_lock() { + for (size_t shard_no = 0; shard_no < m_n_shards; ++shard_no) { + if (!rw_lock_x_lock_nowait(&m_shards[shard_no])) { + while (0 < shard_no--) { + rw_lock_x_unlock(&m_shards[shard_no]); + } + return (false); + } + } + return (true); } void x_lock() { @@ -108,23 +126,18 @@ class Sharded_rw_lock { #ifdef UNIV_DEBUG bool s_own(size_t shard_no) const { - return rw_lock_own(&m_shards[shard_no].lock, RW_LOCK_S); + return rw_lock_own(&m_shards[shard_no], RW_LOCK_S); } - bool x_own() const { return rw_lock_own(&m_shards[0].lock, RW_LOCK_X); } + bool x_own() const { return rw_lock_own(&m_shards[0], RW_LOCK_X); } #endif /* !UNIV_DEBUG */ private: - struct Shard { - rw_lock_t lock; - - char pad[INNOBASE_CACHE_LINE_SIZE]; - }; + using Shard = ut::Cacheline_padded; template void for_each(F f) { - std::for_each(m_shards, m_shards + m_n_shards, - [&f](Shard &shard) { f(shard.lock); }); + std::for_each(m_shards, m_shards + m_n_shards, f); } Shard *m_shards = nullptr; diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 1d8b2d3b1328..cab905f81146 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2020, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2012, Facebook Inc. @@ -155,7 +155,8 @@ extern mysql_pfs_key_t trx_mutex_key; extern mysql_pfs_key_t trx_pool_mutex_key; extern mysql_pfs_key_t trx_pool_manager_mutex_key; extern mysql_pfs_key_t temp_pool_manager_mutex_key; -extern mysql_pfs_key_t lock_mutex_key; +extern mysql_pfs_key_t lock_sys_page_mutex_key; +extern mysql_pfs_key_t lock_sys_table_mutex_key; extern mysql_pfs_key_t lock_wait_mutex_key; extern mysql_pfs_key_t trx_sys_mutex_key; extern mysql_pfs_key_t srv_sys_mutex_key; @@ -189,6 +190,7 @@ extern mysql_pfs_key_t buf_block_debug_latch_key; extern mysql_pfs_key_t dict_operation_lock_key; extern mysql_pfs_key_t undo_spaces_lock_key; extern mysql_pfs_key_t rsegs_lock_key; +extern mysql_pfs_key_t lock_sys_global_rw_lock_key; extern mysql_pfs_key_t fil_space_latch_key; extern mysql_pfs_key_t fts_cache_rw_lock_key; extern mysql_pfs_key_t fts_cache_init_rw_lock_key; diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h index cc387ea9019e..31a42c5c98ce 100644 --- a/storage/innobase/include/sync0types.h +++ b/storage/innobase/include/sync0types.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2020, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -175,7 +175,15 @@ V lock_sys_wait_mutex Mutex protecting lock timeout data | V -lock_sys_mutex Mutex protecting lock_sys_t +lock_sys->global_sharded_latch Sharded rw-latch protecting lock_sys_t +| +V +lock_sys->table_mutexes Mutexes protecting lock_sys_t table +| lock queues +| +V +lock_sys->page_mutexes Mutexes protecting lock_sys_t page +| lock queues | V trx_sys->mutex Mutex protecting trx_sys_t @@ -264,13 +272,13 @@ enum latch_level_t { SYNC_PAGE_CLEANER, SYNC_PURGE_QUEUE, SYNC_TRX_SYS_HEADER, - SYNC_REC_LOCK, SYNC_THREADS, SYNC_TRX, SYNC_POOL, SYNC_POOL_MANAGER, SYNC_TRX_SYS, - SYNC_LOCK_SYS, + SYNC_LOCK_SYS_SHARDED, + SYNC_LOCK_SYS_GLOBAL, SYNC_LOCK_WAIT_SYS, SYNC_INDEX_ONLINE_LOG, @@ -369,6 +377,10 @@ enum latch_id_t { LATCH_ID_IBUF, LATCH_ID_IBUF_PESSIMISTIC_INSERT, LATCH_ID_LOCK_FREE_HASH, + LATCH_ID_LOCK_SYS_GLOBAL, + LATCH_ID_LOCK_SYS_PAGE, + LATCH_ID_LOCK_SYS_TABLE, + LATCH_ID_LOCK_SYS_WAIT, LATCH_ID_LOG_SN, LATCH_ID_LOG_CHECKPOINTER, LATCH_ID_LOG_CLOSER, @@ -410,8 +422,6 @@ enum latch_id_t { LATCH_ID_TRX_POOL_MANAGER, LATCH_ID_TEMP_POOL_MANAGER, LATCH_ID_TRX, - LATCH_ID_LOCK_SYS, - LATCH_ID_LOCK_SYS_WAIT, LATCH_ID_TRX_SYS, LATCH_ID_SRV_SYS, LATCH_ID_SRV_SYS_TASKS, diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h index b590d5e0a55c..f1f5212227bb 100644 --- a/storage/innobase/include/trx0i_s.h +++ b/storage/innobase/include/trx0i_s.h @@ -233,8 +233,10 @@ int trx_i_s_possibly_fetch_data_into_cache( /** Returns TRUE if the data in the cache is truncated due to the memory limit posed by TRX_I_S_MEM_LIMIT. + @param[in] cache The cache @return true if truncated */ -ibool trx_i_s_cache_is_truncated(trx_i_s_cache_t *cache); /*!< in: cache */ +bool trx_i_s_cache_is_truncated(trx_i_s_cache_t *cache); + /** The maximum length of a resulting lock_id_size in trx_i_s_create_lock_id(), not including the terminating NUL. "%lu:%lu:%lu:%lu:%lu" -> 20*5+4 chars */ diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index d7cc4edfdc09..49aed07851cb 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -162,10 +162,13 @@ UNIV_INLINE trx_id_t trx_read_trx_id( const byte *ptr); /*!< in: pointer to memory from where to read */ -/** Looks for the trx instance with the given id in the rw trx_list. - @return the trx handle or NULL if not found */ +/** Looks for the trx handle with the given id in rw trxs list. + The caller must be holding trx_sys->mutex. + @param[in] trx_id trx id to search for + @return the trx handle or NULL if not found */ UNIV_INLINE -trx_t *trx_get_rw_trx_by_id(trx_id_t trx_id); /*!< in: trx id to search for */ +trx_t *trx_get_rw_trx_by_id(trx_id_t trx_id); + /** Returns the minimum trx id in rw trx list. This is the smallest id for which the trx can possibly be active. (But, you must look at the trx->state to find out if the minimum trx id transaction itself is active, or already diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic index 474759866d34..3dab5984b1cc 100644 --- a/storage/innobase/include/trx0sys.ic +++ b/storage/innobase/include/trx0sys.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2020, Oracle and/or its affiliates. 
All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -179,14 +179,8 @@ trx_id_t trx_read_trx_id( return (mach_read_from_6(ptr)); } -/** Looks for the trx handle with the given id in rw_trx_list. - The caller must be holding trx_sys->mutex. - @return the trx handle or NULL if not found; - the pointer must not be dereferenced unless lock_sys->mutex was - acquired before calling this function and is still being held */ UNIV_INLINE -trx_t *trx_get_rw_trx_by_id(trx_id_t trx_id) /*!< in: trx id to search for */ -{ +trx_t *trx_get_rw_trx_by_id(trx_id_t trx_id) { ut_ad(trx_id > 0); ut_ad(trx_sys_mutex_own()); diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index fa9f9506f46d..94d5f8ffef72 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -33,6 +33,7 @@ this program; if not, write to the Free Software Foundation, Inc., #ifndef trx0trx_h #define trx0trx_h +#include #include #include "ha_prototypes.h" @@ -208,13 +209,14 @@ int trx_recover_for_mysql( XA_recover_txn *txn_list, /*!< in/out: prepared transactions */ ulint len, /*!< in: number of slots in xid_list */ MEM_ROOT *mem_root); /*!< in: memory for table names */ + /** This function is used to find one X/Open XA distributed transaction which is in the prepared state + @param[in] xid X/Open XA transaction identifier @return trx or NULL; on match, the trx->xid will be invalidated; - note that the trx may have been committed, unless the caller is - holding lock_sys->mutex */ -trx_t *trx_get_trx_by_xid( - const XID *xid); /*!< in: X/Open XA transaction identifier */ + note that the trx may have been committed */ +trx_t *trx_get_trx_by_xid(const XID *xid); + /** If required, flushes the log to disk if we called trx_commit_for_mysql() with trx->flush_log_later == TRUE. */ void trx_commit_complete_for_mysql(trx_t *trx); /*!< in/out: transaction */ @@ -260,20 +262,20 @@ void trx_print_low(FILE *f, /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ /** Prints info about a transaction. - The caller must hold lock_sys->mutex and trx_sys->mutex. - When possible, use trx_print() instead. */ -void trx_print_latched( - FILE *f, /*!< in: output stream */ - const trx_t *trx, /*!< in: transaction */ - ulint max_query_len); /*!< in: max query length to print, - or 0 to use the default max length */ +The caller must hold lock_sys exclusive global latch and trx_sys->mutex. +@param[in] f output stream +@param[in] trx transaction +@param[in] max_query_len max query length to print, or 0 to use the default + max length */ +void trx_print_latched(FILE *f, const trx_t *trx, ulint max_query_len); /** Prints info about a transaction. - Acquires and releases lock_sys->mutex and trx_sys->mutex. */ -void trx_print(FILE *f, /*!< in: output stream */ - const trx_t *trx, /*!< in: transaction */ - ulint max_query_len); /*!< in: max query length to print, - or 0 to use the default max length */ +Acquires and releases lock_sys exclusive global latch and trx_sys->mutex. +@param[in] f output stream +@param[in] trx transaction +@param[in] max_query_len max query length to print, or 0 to use the default + max length */ +void trx_print(FILE *f, const trx_t *trx, ulint max_query_len); /** Determine if a transaction is a dictionary operation. @return dictionary operation mode */ @@ -540,15 +542,16 @@ typedef std::vector> lock_pool_t; changed asynchronously. 
All these operations take place within the context of locking. Therefore state - changes within the locking code must acquire both the lock mutex and the - trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or + changes within the locking code must latch the shard with the wait_lock and + the trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient to only acquire the trx->mutex. To query the state either of the mutexes is sufficient within the locking code and no mutex is required when the query thread is no longer waiting. */ -/** The locks and state of an active transaction. Protected by -lock_sys->mutex, trx->mutex or both. */ +/** The locks and state of an active transaction. +Protected by exclusive lock_sys latch or trx->mutex combined with shared +lock_sys latch (unless stated otherwise for particular field). */ struct trx_lock_t { ulint n_active_thrs; /*!< number of active query threads */ @@ -556,38 +559,56 @@ struct trx_lock_t { == TRX_STATE_ACTIVE: TRX_QUE_RUNNING, TRX_QUE_LOCK_WAIT, ... */ + /** Incremented each time a lock is added or removed from the + trx->lock.trx_locks, so that the thread which iterates over the list can spot + a change if it occurred while it was reacquiring latches. + Protected by trx->mutex. */ + uint64_t trx_locks_version; + /** If this transaction is waiting for a lock, then blocking_trx points to a transaction which holds a conflicting lock. - The opposite is not true sometimes, that is: - 1. It is possible that the transaction has trx->lock.wait_lock == null, yet it + It is possible that the transaction has trx->lock.wait_lock == nullptr, yet it has non-null value of trx->lock.blocking_trx. For example this can happen when we are in the process of moving locks from one heap_no to another. This - however is always done while the lock_sys mutex is latched and conceptually it - is true that the blocking_trx is the one for which the transaction waits, even - though temporarily there is no pointer to a particular WAITING lock object. - 2. If the trx is not waiting for any other transaction, this field might - contain some left-over value from previous wait, although we try to keep it - clean to make debugging easier it is not a requirement for correctness of the - deadlock detection, as it is performed only among transactions which are - waiting. - - This field is changed from null to non-null, when holding trx_mutex_own(this) - and lock_sys mutex. + however is always done while the lock_sys shards which contain the queues + involved are latched and conceptually it is true that the blocking_trx is + the one for which the transaction waits, even though temporarily there is no + pointer to a particular WAITING lock object. + + This field is changed from null to non-null, when holding this->mutex and + mutex for lock_sys shard containing the new value of trx->lock.wait_lock. The field is changed from non-null to different non-null value, while holding - lock_sys mutex. - The field is changed from non-null to null, while holding trx_mutex_own(this), - and lock_sys mutex. + mutex for lock_sys shard containing the trx->lock.wait_lock. + The field is changed from non-null to null, while holding this->mutex, + mutex for lock_sys shard containing the old value of trx->lock.wait_lock, + before it was changed to null. + Readers might read it without any latch, but then they should validate the value, i.e. test if it is not-null, and points to a valid trx. 
- To make any definite judgments it one needs to latch the lock_sys mutex. */ + To make any definite judgments one needs to latch the lock_sys shard + containing the trx->lock.wait_lock. */ std::atomic blocking_trx; - /** If trx execution state is TRX_QUE_LOCK_WAIT, this points to the lock - request, otherwise this is NULL; set to non-NULL when holding both trx->mutex - and lock_sys->mutex; set to NULL when holding lock_sys->mutex; readers should - hold lock_sys->mutex, except when they are holding trx->mutex and - wait_lock==NULL */ - lock_t *wait_lock; + /** The lock request of this transaction is waiting for. + It might be NULL if the transaction is not currently waiting, or if the lock + was temporarily removed during B-tree reorganization and will be recreated in + a different queue. Such move is protected by latching the shards containing + both queues, so the intermediate state should not be observed by readers who + latch the old shard. + + Changes from NULL to non-NULL while holding trx->mutex and latching the shard + containing the new wait_lock value. + Changes from non-NULL to NULL while latching the shard containing the old + wait_lock value. + Never changes from non-NULL to other non-NULL directly. + + Readers should hold exclusive global latch on lock_sys, as in general they + can't know what shard the lock belongs to before reading it. + However, in debug assertions, where we strongly believe to know the value of + this field in advance, we can: + - read without any latch if we believe the value should be NULL + - read latching only the shard containing the wait_lock we expect */ + std::atomic wait_lock; /** Stores the type of the most recent lock for which this trx had to wait. Set to lock_get_type_low(wait_lock) together with wait_lock in @@ -596,7 +617,7 @@ struct trx_lock_t { lock_reset_lock_and_trx_wait() as in lock_wait_suspend_thread() we are interested in reporting the last known value of this field via thd_wait_begin(). When a thread has to wait for a lock, it first releases - lock-sys mutex, and then calls lock_wait_suspend_thread() where among other + lock-sys latch, and then calls lock_wait_suspend_thread() where among other things it tries to report statistic via thd_wait_begin() about the kind of lock (THD_WAIT_ROW_LOCK vs THD_WAIT_TABLE_LOCK) that caused the wait. But there is a possibility that before it gets to call thd_wait_begin() some other @@ -611,8 +632,8 @@ struct trx_lock_t { lock_reset_lock_and_trx_wait() which changes trx->lock.wait_lock to NULL, but then calls lock_rec_add_to_queue() -> RecLock::create() -> RecLock::lock_add() -> lock_set_lock_and_trx_wait() to set it again to the new lock. This all - happens while holding lock-sys mutex, but we read wait_lock_type without this - mutex, so we should not clear the wait_lock_type simply because somebody + happens while holding lock-sys latch, but we read wait_lock_type without this + latch, so we should not clear the wait_lock_type simply because somebody changed wait_lock to NULL. Protected by trx->mutex. */ uint32_t wait_lock_type; @@ -624,16 +645,19 @@ struct trx_lock_t { transaction as a victim in deadlock resolution, it sets this to true. Protected by trx->mutex. */ - time_t wait_started; /*!< lock wait started at this time, - protected only by lock_sys->mutex */ - que_thr_t *wait_thr; /*!< query thread belonging to this - trx that is in QUE_THR_LOCK_WAIT - state. For threads suspended in a - lock wait, this is protected by - lock_sys->mutex. 
Otherwise, this may - only be modified by the thread that is - serving the running transaction. */ + /** Lock wait started at this time. + Writes under shared lock_sys latch combined with trx->mutex. + Reads require either trx->mutex or exclusive lock_sys latch. */ + time_t wait_started; + + /** query thread belonging to this trx that is in QUE_THR_LOCK_WAIT state. + For threads suspended in a lock wait, this is protected by lock_sys latch for + the wait_lock's shard. + Otherwise, this may only be modified by the thread that is serving the running + transaction. + */ + que_thr_t *wait_thr; /** Pre-allocated record locks. Protected by trx->mutex. */ lock_pool_t rec_pool; @@ -651,7 +675,7 @@ struct trx_lock_t { mem_heap_t *lock_heap; /** Locks requested by the transaction. - Modifications are protected by trx->mutex and lock_sys mutex. + Modifications are protected by trx->mutex and shard of lock_sys mutex. Reads can be performed while holding trx->mutex or exclusive lock_sys latch. One can also check if this list is empty or not from the thread running this transaction without holding any latches, keeping in mind that other threads @@ -673,7 +697,9 @@ struct trx_lock_t { Protected by trx->mutex. */ ib_vector_t *autoinc_locks; - /** number of rec locks in this trx */ + /** Number of rec locks in this trx. + It is modified with shared lock_sys latch. + It is read with exclusive lock_sys latch. */ std::atomic n_rec_locks; /** Used to indicate that every lock of this transaction placed on a record @@ -746,7 +772,7 @@ and lock_trx_release_locks() [invoked by trx_commit()]. * Print of transactions may access transactions not associated with the current thread. The caller must be holding trx_sys->mutex and -lock_sys->mutex. +exclusive global lock_sys latch. * When a transaction handle is in the trx_sys->mysql_trx_list or trx_sys->trx_list, some of its fields must not be modified without @@ -754,7 +780,7 @@ holding trx_sys->mutex exclusively. * The locking code (in particular, deadlock checking and implicit to explicit conversion) will access transactions associated to other -connections. The locks of transactions are protected by lock_sys->mutex +connections. The locks of transactions are protected by lock_sys latches and sometimes by trx->mutex. * Killing of asynchronous transactions. */ @@ -824,7 +850,7 @@ struct trx_t { }; /** Mutex protecting the fields `state` and `lock` (except some fields of - `lock`, which are protected by lock_sys->mutex) */ + `lock`, which are protected by lock_sys latches) */ mutable TrxMutex mutex; /* Note: in_depth was split from in_innodb for fixing a RO @@ -939,10 +965,10 @@ struct trx_t { to check for the view limit for transactions that are committing */ - trx_lock_t lock; /*!< Information about the transaction - locks and state. Protected by - trx->mutex or lock_sys->mutex - or both */ + /** Information about the transaction locks and state. + Protected by trx->mutex or lock_sys latches or both */ + trx_lock_t lock; + bool is_recovered; /*!< 0=normal transaction, 1=recovered, must be rolled back, protected by trx_sys->mutex when @@ -1259,16 +1285,53 @@ struct commit_node_t { /** Test if trx->mutex is owned by the current thread. */ #define trx_mutex_own(t) mutex_own(&t->mutex) -/** Acquire the trx->mutex. */ -#define trx_mutex_enter(t) \ - do { \ - mutex_enter(&t->mutex); \ +#ifdef UNIV_DEBUG +/** +Verifies the invariants and records debug state related to latching rules. +Called during trx_mutex_enter before the actual mutex acquisition. 
+@param[in] trx The transaction for which trx_mutex_enter(trx) is + called +@param[in] allow_another If false, then no further calls to trx_mutex_enter + are allowed, until trx_mutex_exit(). + If true, then this must be the first trx acquisition + and we will allow one more. +*/ +void trx_before_mutex_enter(const trx_t *trx, bool allow_another); + +/** +Verifies the invariants and records debug state related to latching rules. +Called during trx_mutex_exit before the actual mutex release. +@param[in] trx The transaction for which trx_mutex_exit(trx) is called +*/ +void trx_before_mutex_exit(const trx_t *trx); +#endif + +/** +Please do not use this low-level macro. +Use trx_mutex_enter(t) instead. +In rare cases where you need to take two trx->mutex-es, take the first one +using trx_mutex_enter_first_of_two(t1), and the second one with +trx_mutex(2) +*/ +#define trx_mutex_enter_low(t, first_of_two) \ + do { \ + ut_ad(!trx_mutex_own(t)); \ + ut_d(trx_before_mutex_enter(t, first_of_two)); \ + mutex_enter(&t->mutex); \ } while (0) +/** Acquire the trx->mutex (and promise not to request any more). */ +#define trx_mutex_enter(t) trx_mutex_enter_low(t, false) + +/** Acquire the trx->mutex (and indicate we might request one more). */ +#define trx_mutex_enter_first_of_two(t) trx_mutex_enter_low(t, true) + /** Release the trx->mutex. */ -#define trx_mutex_exit(t) \ - do { \ - mutex_exit(&t->mutex); \ +#define trx_mutex_exit(t) \ + do { \ + ut_ad(trx_mutex_own(t)); \ + ut_d(trx_before_mutex_exit(t)); \ + mutex_exit(&t->mutex); \ } while (0) /** Track if a transaction is executing inside InnoDB code. It acts diff --git a/storage/innobase/include/ut0class_life_cycle.h b/storage/innobase/include/ut0class_life_cycle.h new file mode 100644 index 000000000000..c1d760977f06 --- /dev/null +++ b/storage/innobase/include/ut0class_life_cycle.h @@ -0,0 +1,52 @@ +/***************************************************************************** + +Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is also distributed with certain software (including but not +limited to OpenSSL) that is licensed under separate terms, as designated in a +particular file or component or in included license documentation. The authors +of MySQL hereby grant you an additional permission to link the program and +your derivative works with the separately licensed software that they have +included with MySQL. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/** @file include/ut0class_life_cycle.h +Utilities related to class lifecycle. */ + +#ifndef ut0class_life_cycle_h +#define ut0class_life_cycle_h + +namespace ut { + +/** +A utility class which, if inherited from, prevents the descendant class +from being copied, moved, or assigned. +This is useful for guard classes. 
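As a usage illustration (not taken from the patch), a latch guard can simply inherit from the class declared just below, much as the locksys guards elsewhere in this change are intended to:

/* Hypothetical guard, shown only to illustrate ut::Non_copyable; Lock_mutex
and mutex_enter()/mutex_exit() are the existing InnoDB facilities used
elsewhere in this patch. */
class Example_mutex_guard : private ut::Non_copyable {
 public:
  explicit Example_mutex_guard(Lock_mutex &mutex) : m_mutex{mutex} {
    mutex_enter(&m_mutex);
  }
  ~Example_mutex_guard() { mutex_exit(&m_mutex); }
  /* Copying and assignment are deleted via Non_copyable, so two copies of the
  guard can never release the same mutex twice. */
 private:
  Lock_mutex &m_mutex;
};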
+*/ +class Non_copyable { + public: + Non_copyable(const Non_copyable &) = delete; + Non_copyable &operator=(const Non_copyable &) = delete; + + protected: + Non_copyable() = default; + ~Non_copyable() = default; /// Protected non-virtual destructor +}; + +} /* namespace ut */ + +#endif /* ut0class_life_cycle_h */ \ No newline at end of file diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h index 39efb13cf3ad..e5ff3568aee0 100644 --- a/storage/innobase/include/ut0counter.h +++ b/storage/innobase/include/ut0counter.h @@ -39,19 +39,13 @@ this program; if not, write to the Free Software Foundation, Inc., #include "univ.i" #include "os0thread.h" +#include "ut0cpu_cache.h" #include "ut0dbg.h" #include #include #include -/** CPU cache line size */ -#ifdef __powerpc__ -#define INNOBASE_CACHE_LINE_SIZE 128 -#else -#define INNOBASE_CACHE_LINE_SIZE 64 -#endif /* __powerpc__ */ - /** Default number of slots to use in ib_counter_t */ #define IB_N_SLOTS 64 @@ -62,7 +56,7 @@ struct generic_indexer_t { /** @return offset within m_counter */ static size_t offset(size_t index) UNIV_NOTHROW { - return (((index % N) + 1) * (INNOBASE_CACHE_LINE_SIZE / sizeof(Type))); + return (((index % N) + 1) * (ut::INNODB_CACHE_LINE_SIZE / sizeof(Type))); } }; @@ -105,7 +99,7 @@ struct single_indexer_t { /** @return offset within m_counter */ static size_t offset(size_t index) UNIV_NOTHROW { ut_ad(N == 1); - return ((INNOBASE_CACHE_LINE_SIZE / sizeof(Type))); + return ((ut::INNODB_CACHE_LINE_SIZE / sizeof(Type))); } /** @return 1 */ @@ -120,7 +114,7 @@ struct single_indexer_t { /** Class for using fuzzy counters. The counter is not protected by any mutex and the results are not guaranteed to be 100% accurate but close enough. Creates an array of counters and separates each element by the -INNOBASE_CACHE_LINE_SIZE bytes */ +ut::INNODB_CACHE_LINE_SIZE bytes */ template class Indexer = default_indexer_t> class ib_counter_t { @@ -133,7 +127,7 @@ class ib_counter_t { bool validate() UNIV_NOTHROW { #ifdef UNIV_DEBUG - size_t n = (INNOBASE_CACHE_LINE_SIZE / sizeof(Type)); + size_t n = (ut::INNODB_CACHE_LINE_SIZE / sizeof(Type)); /* Check that we aren't writing outside our defined bounds. */ for (size_t i = 0; i < UT_ARR_SIZE(m_counter); i += n) { @@ -219,7 +213,7 @@ class ib_counter_t { Indexer m_policy; /** Slot 0 is unused. */ - Type m_counter[(N + 1) * (INNOBASE_CACHE_LINE_SIZE / sizeof(Type))]; + Type m_counter[(N + 1) * (ut::INNODB_CACHE_LINE_SIZE / sizeof(Type))]; }; /** Sharded atomic counter. */ @@ -229,10 +223,10 @@ using Type = uint64_t; using N = std::atomic; -static_assert(INNOBASE_CACHE_LINE_SIZE >= sizeof(N), - "Atomic counter size > INNOBASE_CACHE_LINE_SIZE"); +static_assert(ut::INNODB_CACHE_LINE_SIZE >= sizeof(N), + "Atomic counter size > ut::INNODB_CACHE_LINE_SIZE"); -using Pad = byte[INNOBASE_CACHE_LINE_SIZE - sizeof(N)]; +using Pad = byte[ut::INNODB_CACHE_LINE_SIZE - sizeof(N)]; /** Counter shard. */ struct Shard { diff --git a/storage/innobase/include/ut0cpu_cache.h b/storage/innobase/include/ut0cpu_cache.h new file mode 100644 index 000000000000..40053e02dc3f --- /dev/null +++ b/storage/innobase/include/ut0cpu_cache.h @@ -0,0 +1,56 @@ +/***************************************************************************** + +Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. 
+ +This program is also distributed with certain software (including but not +limited to OpenSSL) that is licensed under separate terms, as designated in a +particular file or component or in included license documentation. The authors +of MySQL hereby grant you an additional permission to link the program and +your derivative works with the separately licensed software that they have +included with MySQL. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/** @file include/ut0cpu_cache.h +Utilities related to CPU cache. */ + +#ifndef ut0cpu_cache_h +#define ut0cpu_cache_h + +#include "ut0ut.h" +namespace ut { + +/** CPU cache line size */ +#ifdef __powerpc__ +constexpr size_t INNODB_CACHE_LINE_SIZE = 128; +#else +constexpr size_t INNODB_CACHE_LINE_SIZE = 64; +#endif /* __powerpc__ */ + +/** +A utility wrapper class, which adds padding at the end of the wrapped structure, +so that the next object after it is guaranteed to be in the next cache line. +This is to avoid false-sharing. +Use this, as opposed to alignas(), to avoid problems with allocators which do +not handle over-aligned types. + */ +template +struct Cacheline_padded : public T { + char pad[INNODB_CACHE_LINE_SIZE]; +}; +} /* namespace ut */ + +#endif /* ut0cpu_cache_h */ diff --git a/storage/innobase/include/ut0link_buf.h b/storage/innobase/include/ut0link_buf.h index b8278bc8997f..f6e939274053 100644 --- a/storage/innobase/include/ut0link_buf.h +++ b/storage/innobase/include/ut0link_buf.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2017, 2018, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -185,7 +185,7 @@ class Link_buf { std::atomic *m_links; /** Tail pointer in the buffer (expressed in original unit). */ - alignas(INNOBASE_CACHE_LINE_SIZE) std::atomic m_tail; + alignas(ut::INNODB_CACHE_LINE_SIZE) std::atomic m_tail; }; template diff --git a/storage/innobase/include/ut0mpmcbq.h b/storage/innobase/include/ut0mpmcbq.h index ed7c04ef35cc..ce3e82e41d2b 100644 --- a/storage/innobase/include/ut0mpmcbq.h +++ b/storage/innobase/include/ut0mpmcbq.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2017, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, Oracle and/or its affiliates. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -26,6 +26,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #ifndef ut0mpmcbq_h #define ut0mpmcbq_h +#include "ut0cpu_cache.h" + #include /** Multiple producer consumer, bounded queue @@ -182,7 +184,7 @@ class mpmc_bq { } private: - using Pad = byte[INNOBASE_CACHE_LINE_SIZE]; + using Pad = byte[ut::INNODB_CACHE_LINE_SIZE]; struct Cell { std::atomic m_pos; diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h index c8e97c7d2621..838520b136c9 100644 --- a/storage/innobase/include/ut0new.h +++ b/storage/innobase/include/ut0new.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2014, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -143,8 +143,8 @@ with (), thus: #include "os0proc.h" #include "os0thread.h" #include "univ.i" -#include "ut0byte.h" /* ut_align */ -#include "ut0counter.h" /* INNOBASE_CACHE_LINE_SIZE */ +#include "ut0byte.h" /* ut_align */ +#include "ut0cpu_cache.h" #include "ut0ut.h" #define OUT_OF_MEMORY_MSG \ @@ -1230,7 +1230,7 @@ class aligned_memory { /** Manages an object that is aligned to specified number of bytes. @tparam T_Type type of the object that is going to be managed @tparam T_Align_to number of bytes to align to */ -template +template class aligned_pointer : public aligned_memory { public: ~aligned_pointer() { @@ -1257,7 +1257,7 @@ class aligned_pointer : public aligned_memory { number of bytes. @tparam T_Type type of the object that is going to be managed @tparam T_Align_to number of bytes to align to */ -template +template class aligned_array_pointer : public aligned_memory { public: /** Allocates aligned memory for new objects. Objects must be trivially diff --git a/storage/innobase/lock/lock0aarch64_atomic.cc b/storage/innobase/lock/lock0aarch64_atomic.cc new file mode 100644 index 000000000000..a546bc606614 --- /dev/null +++ b/storage/innobase/lock/lock0aarch64_atomic.cc @@ -0,0 +1,27 @@ +/***************************************************************************** + +Copyright (c) 2020, Huawei Technologies Co., Ltd. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. 
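A small usage sketch (not from the patch) for ut::Cacheline_padded introduced in ut0cpu_cache.h above: wrapping a per-shard structure so that adjacent shards never share a cache line.

#include <atomic>
#include <cstdint>
#include "ut0cpu_cache.h"

/* Example_shard_stats is invented for this illustration; only the use of
ut::Cacheline_padded matters. Each array element ends with
INNODB_CACHE_LINE_SIZE bytes of padding, so concurrent writers to different
shards do not false-share a cache line. */
struct Example_shard_stats {
  std::atomic<uint64_t> hits{0};
};

static ut::Cacheline_padded<Example_shard_stats> example_stats_shards[8];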
+ +*****************************************************************************/ + +#include "lock0aarch64_atomic.h" + +lint word_add_fetch(volatile lint *word, ulint amount) { + asm volatile ( + "ldaddal %0, x3, [%1]\n\t" + "add %0, x3, %0" + :"+r"(amount) + :"r"(word) + :"x3","memory" + ); + return amount; +} \ No newline at end of file diff --git a/storage/innobase/lock/lock0guards.cc b/storage/innobase/lock/lock0guards.cc new file mode 100644 index 000000000000..e09454459521 --- /dev/null +++ b/storage/innobase/lock/lock0guards.cc @@ -0,0 +1,114 @@ +/***************************************************************************** + +Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is also distributed with certain software (including but not +limited to OpenSSL) that is licensed under separate terms, as designated in a +particular file or component or in included license documentation. The authors +of MySQL hereby grant you an additional permission to link the program and +your derivative works with the separately licensed software that they have +included with MySQL. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#define LOCK_MODULE_IMPLEMENTATION + +#include "lock0guards.h" +#include "lock0priv.h" +#include "sync0rw.h" + +namespace locksys { + +/* Global_exclusive_latch_guard */ + +Global_exclusive_latch_guard::Global_exclusive_latch_guard() { + lock_sys->latches.global_latch.x_lock(); +} + +Global_exclusive_latch_guard::~Global_exclusive_latch_guard() { + lock_sys->latches.global_latch.x_unlock(); +} + +/* Global_exclusive_try_latch */ + +Global_exclusive_try_latch::Global_exclusive_try_latch() { + m_owns_exclusive_global_latch = lock_sys->latches.global_latch.try_x_lock(); +} + +Global_exclusive_try_latch::~Global_exclusive_try_latch() { + if (m_owns_exclusive_global_latch) { + lock_sys->latches.global_latch.x_unlock(); + m_owns_exclusive_global_latch = false; + } +} + +/* Shard_naked_latch_guard */ + +Shard_naked_latch_guard::Shard_naked_latch_guard(Lock_mutex &shard_mutex) + : m_shard_mutex{shard_mutex} { + ut_ad(owns_shared_global_latch()); + mutex_enter(&m_shard_mutex); +} + +Shard_naked_latch_guard::Shard_naked_latch_guard(const dict_table_t &table) + : Shard_naked_latch_guard{lock_sys->latches.table_shards.get_mutex(table)} { +} + +Shard_naked_latch_guard::Shard_naked_latch_guard(const page_id_t &page_id) + : Shard_naked_latch_guard{ + lock_sys->latches.page_shards.get_mutex(page_id)} {} + +Shard_naked_latch_guard::~Shard_naked_latch_guard() { + mutex_exit(&m_shard_mutex); +} + +/* Global_shared_latch_guard */ + +Global_shared_latch_guard::Global_shared_latch_guard() { + lock_sys->latches.global_latch.s_lock(); +} + +Global_shared_latch_guard::~Global_shared_latch_guard() { + lock_sys->latches.global_latch.s_unlock(); +} + +/* Shard_naked_latches_guard */ + 
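Before the two-shard guard below, a brief sketch (illustrative, not from the patch) of the two usage patterns the guards above enable. Shard_latch_guard is assumed to be the RAII composition of Global_shared_latch_guard and Shard_naked_latch_guard declared in lock0guards.h, which is not part of this hunk.

/* Sketch only: both example function names are invented. */
static void example_single_shard_operation(const page_id_t &page_id) {
  /* Global latch in S mode plus the one shard mutex covering this page. */
  locksys::Shard_latch_guard guard{page_id};
  ut_ad(locksys::owns_page_shard(page_id));
  /* ... may modify the record lock queues of this page only ... */
}

static void example_whole_lock_sys_operation() {
  /* Global latch in X mode: excludes every shard user at once, so no shard
  mutex is needed (this is what lock_sys_resize() relies on). */
  locksys::Global_exclusive_latch_guard latch_guard{};
  ut_ad(locksys::owns_exclusive_global_latch());
  /* ... may traverse any queue and any trx->lock.wait_lock ... */
}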
+Shard_naked_latches_guard::Shard_naked_latches_guard(Lock_mutex &shard_mutex_a, + Lock_mutex &shard_mutex_b) + : m_shard_mutex_1{*std::min(&shard_mutex_a, &shard_mutex_b, MUTEX_ORDER)}, + m_shard_mutex_2{*std::max(&shard_mutex_a, &shard_mutex_b, MUTEX_ORDER)} { + ut_ad(owns_shared_global_latch()); + if (&m_shard_mutex_1 != &m_shard_mutex_2) { + mutex_enter(&m_shard_mutex_1); + } + mutex_enter(&m_shard_mutex_2); +} + +Shard_naked_latches_guard::Shard_naked_latches_guard(const buf_block_t &block_a, + const buf_block_t &block_b) + : Shard_naked_latches_guard{ + lock_sys->latches.page_shards.get_mutex(block_a.get_page_id()), + lock_sys->latches.page_shards.get_mutex(block_b.get_page_id())} {} + +Shard_naked_latches_guard::~Shard_naked_latches_guard() { + mutex_exit(&m_shard_mutex_2); + if (&m_shard_mutex_1 != &m_shard_mutex_2) { + mutex_exit(&m_shard_mutex_1); + } +} + +} // namespace locksys diff --git a/storage/innobase/lock/lock0iter.cc b/storage/innobase/lock/lock0iter.cc index 660b139f6323..8e9d399340a0 100644 --- a/storage/innobase/lock/lock0iter.cc +++ b/storage/innobase/lock/lock0iter.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -55,7 +55,8 @@ void lock_queue_iterator_reset( ulint bit_no) /*!< in: record number in the heap */ { - ut_ad(lock_mutex_own()); + ut_ad(lock != nullptr); + ut_ad(locksys::owns_lock_shard(lock)); iter->current_lock = lock; @@ -85,7 +86,8 @@ const lock_t *lock_queue_iterator_get_prev( { const lock_t *prev_lock; - ut_ad(lock_mutex_own()); + ut_ad(iter->current_lock != nullptr); + ut_ad(locksys::owns_lock_shard(iter->current_lock)); switch (lock_get_type_low(iter->current_lock)) { case LOCK_REC: diff --git a/storage/innobase/lock/lock0latches.cc b/storage/innobase/lock/lock0latches.cc new file mode 100644 index 000000000000..f2a09ba2317e --- /dev/null +++ b/storage/innobase/lock/lock0latches.cc @@ -0,0 +1,107 @@ +/***************************************************************************** + +Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is also distributed with certain software (including but not +limited to OpenSSL) that is licensed under separate terms, as designated in a +particular file or component or in included license documentation. The authors +of MySQL hereby grant you an additional permission to link the program and +your derivative works with the separately licensed software that they have +included with MySQL. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#define LOCK_MODULE_IMPLEMENTATION + +#include "lock0latches.h" +#include "lock0lock.h" +#include "lock0priv.h" + +namespace locksys { + +size_t Latches::Page_shards::get_shard(const page_id_t &page_id) { + /* We always use lock_rec_hash regardless of the exact type of the lock. + It may happen that the lock is a predicate lock, in which case, + it would make more sense to use hash_calc_hash with proper hash table + size. The current implementation works, because the size of all three + hashmaps is always the same. This allows an interface with less arguments. + */ + ut_ad(lock_sys->rec_hash->n_cells == lock_sys->prdt_hash->n_cells); + ut_ad(lock_sys->rec_hash->n_cells == lock_sys->prdt_page_hash->n_cells); + return lock_rec_hash(page_id.space(), page_id.page_no()) % SHARDS_COUNT; +} + +const Lock_mutex &Latches::Page_shards::get_mutex( + const page_id_t &page_id) const { + return mutexes[get_shard(page_id)]; +} + +Lock_mutex &Latches::Page_shards::get_mutex(const page_id_t &page_id) { + /* See "Effective C++ item 3: Use const whenever possible" for explanation of + this pattern, which avoids code duplication by reusing const version. */ + return const_cast( + const_cast(this)->get_mutex(page_id)); +} + +size_t Latches::Table_shards::get_shard(const dict_table_t &table) { + return table.id % SHARDS_COUNT; +} + +const Lock_mutex &Latches::Table_shards::get_mutex( + const dict_table_t &table) const { + return mutexes[get_shard(table)]; +} + +Lock_mutex &Latches::Table_shards::get_mutex(const dict_table_t &table) { + /* See "Effective C++ item 3: Use const whenever possible" for explanation of + this pattern, which avoids code duplication by reusing const version. */ + return const_cast( + const_cast(this)->get_mutex(table)); +} + +thread_local size_t Latches::Unique_sharded_rw_lock::m_shard_id{NOT_IN_USE}; + +Latches::Unique_sharded_rw_lock::Unique_sharded_rw_lock() { + rw_lock.create(lock_sys_global_rw_lock_key, SYNC_LOCK_SYS_GLOBAL, 64); +} + +Latches::Unique_sharded_rw_lock::~Unique_sharded_rw_lock() { rw_lock.free(); } + +Latches::Page_shards::Page_shards() { + for (size_t i = 0; i < SHARDS_COUNT; ++i) { + mutex_create(LATCH_ID_LOCK_SYS_PAGE, mutexes + i); + } +} + +Latches::Page_shards::~Page_shards() { + for (size_t i = 0; i < SHARDS_COUNT; ++i) { + mutex_destroy(mutexes + i); + } +} + +Latches::Table_shards::Table_shards() { + for (size_t i = 0; i < SHARDS_COUNT; ++i) { + mutex_create(LATCH_ID_LOCK_SYS_TABLE, mutexes + i); + } +} + +Latches::Table_shards::~Table_shards() { + for (size_t i = 0; i < SHARDS_COUNT; ++i) { + mutex_destroy(mutexes + i); + } +} + +} // namespace locksys diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 64bed968ebe9..ced2e35f2d31 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -43,6 +43,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "btr0btr.h" #include "current_thd.h" +#include "debug_sync.h" /* CONDITIONAL_SYNC_POINT */ #include "dict0boot.h" #include "dict0mem.h" #include "ha_prototypes.h" @@ -91,7 +92,9 @@ static const std::map lock_constant_names{ }; /** Used by lock_get_mode_str to cache results. 
Strings pointed by these pointers might be in use by performance schema and thus can not be freed -until the very end */ +until the very end. +Protected by exclusive global lock_sys latch. +*/ static std::unordered_map lock_cached_lock_mode_names; /** A static class for reporting notifications about deadlocks */ @@ -150,9 +153,32 @@ class Deadlock_notifier { }; #ifdef UNIV_DEBUG -/** Validates the lock system. - @return true if ok */ -static bool lock_validate(); +namespace locksys { + +bool owns_exclusive_global_latch() { + return lock_sys->latches.owns_exclusive_global_latch(); +} + +bool owns_shared_global_latch() { + return lock_sys->latches.owns_shared_global_latch(); +} + +bool owns_page_shard(const page_id_t &page_id) { + return lock_sys->latches.owns_page_shard(page_id); +} + +bool owns_table_shard(const dict_table_t &table) { + return lock_sys->latches.owns_table_shard(table); +} + +bool owns_lock_shard(const lock_t *lock) { + if (lock->is_record_lock()) { + return lock_sys->latches.owns_page_shard(lock->rec_lock.get_page_id()); + } else { + return lock_sys->latches.owns_table_shard(*lock->tab_lock.table); + } +} +} // namespace locksys /** Validates the record lock queues on a page. @return true if ok */ @@ -294,14 +320,14 @@ void lock_sys_create( lock_sys = static_cast(ut_zalloc_nokey(lock_sys_sz)); + new (lock_sys) lock_sys_t{}; + void *ptr = &lock_sys[1]; lock_sys->waiting_threads = static_cast(ptr); lock_sys->last_slot = lock_sys->waiting_threads; - mutex_create(LATCH_ID_LOCK_SYS, &lock_sys->mutex); - mutex_create(LATCH_ID_LOCK_SYS_WAIT, &lock_sys->wait_mutex); lock_sys->timeout_event = os_event_create(nullptr); @@ -328,13 +354,28 @@ static ulint lock_rec_lock_fold(const lock_t *lock) { void lock_sys_resize(ulint n_cells) { hash_table_t *old_hash; - lock_mutex_enter(); + /* We will rearrange locks between buckets and change the parameters of hash + function used in sharding of latches, so we have to prevent everyone from + accessing lock sys queues, or even computing shard id. */ + locksys::Global_exclusive_latch_guard guard{}; old_hash = lock_sys->rec_hash; lock_sys->rec_hash = hash_create(n_cells); HASH_MIGRATE(old_hash, lock_sys->rec_hash, lock_t, hash, lock_rec_lock_fold); hash_table_free(old_hash); + DBUG_EXECUTE_IF("syncpoint_after_lock_sys_resize_rec_hash", { + /* A workaround for buf_resize_thread() not using create_thd(). + TBD: if buf_resize_thread() were to use create_thd() then should it be + instrumented (together or instead of os_thread_create instrumentation)? */ + ut_ad(current_thd == nullptr); + THD *thd = create_thd(false, true, true, PSI_NOT_INSTRUMENTED); + ut_ad(current_thd == thd); + CONDITIONAL_SYNC_POINT("after_lock_sys_resize_rec_hash"); + destroy_thd(thd); + ut_ad(current_thd == nullptr); + }); + old_hash = lock_sys->prdt_hash; lock_sys->prdt_hash = hash_create(n_cells); HASH_MIGRATE(old_hash, lock_sys->prdt_hash, lock_t, hash, lock_rec_lock_fold); @@ -366,8 +407,6 @@ void lock_sys_resize(ulint n_cells) { } mutex_exit(&buf_pool->LRU_list_mutex); } - - lock_mutex_exit(); } /** Closes the lock system at database shutdown. 
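The create/close hunks above pair a placement new with an explicit destructor call because lock_sys_t now owns non-trivial members (the sharded latches). A minimal sketch of that idiom, using an invented owner type, assuming locksys::Latches is the class defined in lock0latches.h:

#include <new>

/* Example_owner stands in for lock_sys_t; only the construction/destruction
pairing is the point. ut_zalloc_nokey()/ut_free() are the existing InnoDB
allocation helpers used above. */
struct Example_owner {
  locksys::Latches latches; /* ctor creates the mutexes, dtor destroys them */
};

static Example_owner *example_create(size_t trailing_bytes) {
  void *buf = ut_zalloc_nokey(sizeof(Example_owner) + trailing_bytes);
  return new (buf) Example_owner{};
}

static void example_destroy(Example_owner *owner) {
  owner->~Example_owner(); /* run member destructors before freeing raw memory */
  ut_free(owner);
}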
*/ @@ -383,7 +422,6 @@ void lock_sys_close(void) { os_event_destroy(lock_sys->timeout_event); - mutex_destroy(&lock_sys->mutex); mutex_destroy(&lock_sys->wait_mutex); srv_slot_t *slot = lock_sys->waiting_threads; @@ -397,6 +435,9 @@ void lock_sys_close(void) { ut_free(const_cast(cached_lock_mode_name.second)); } lock_cached_lock_mode_names.clear(); + + lock_sys->~lock_sys_t(); + ut_free(lock_sys); lock_sys = nullptr; @@ -411,9 +452,9 @@ ulint lock_get_size(void) { return ((ulint)sizeof(lock_t)); } UNIV_INLINE void lock_set_lock_and_trx_wait(lock_t *lock) { auto trx = lock->trx; - ut_a(trx->lock.wait_lock == nullptr); - ut_ad(lock_mutex_own()); ut_ad(trx_mutex_own(trx)); + ut_a(trx->lock.wait_lock == nullptr); + ut_ad(locksys::owns_lock_shard(lock)); trx->lock.wait_lock = lock; trx->lock.wait_lock_type = lock_get_type_low(lock); @@ -630,19 +671,13 @@ void lock_rec_trx_wait(lock_t *lock, ulint i, ulint type) { } } -/** Determines if there are explicit record locks on a page. - @return an explicit record lock on the page, or NULL if there are none */ -lock_t *lock_rec_expl_exist_on_page(space_id_t space, /*!< in: space id */ - page_no_t page_no) /*!< in: page number */ -{ +bool lock_rec_expl_exist_on_page(space_id_t space, page_no_t page_no) { lock_t *lock; - - lock_mutex_enter(); + locksys::Shard_latch_guard guard{page_id_t{space, page_no}}; /* Only used in ibuf pages, so rec_hash is good enough */ lock = lock_rec_get_first_on_page_addr(lock_sys->rec_hash, space, page_no); - lock_mutex_exit(); - return (lock); + return (lock != nullptr); } /** Resets the record lock bitmap to zero. NOTE: does not touch the wait_lock @@ -690,7 +725,7 @@ const lock_t *lock_rec_get_prev( lock_t *found_lock = nullptr; hash_table_t *hash; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(in_lock->rec_lock.get_page_id())); ut_ad(lock_get_type_low(in_lock) == LOCK_REC); space = in_lock->rec_lock.space; @@ -726,7 +761,7 @@ const lock_t *lock_rec_get_prev( UNIV_INLINE const lock_t *lock_rec_has_expl(ulint precise_mode, const buf_block_t *block, ulint heap_no, const trx_t *trx) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S || (precise_mode & LOCK_MODE_MASK) == LOCK_X); ut_ad( @@ -765,7 +800,7 @@ static const lock_t *lock_rec_other_has_expl_req( requests by all transactions are taken into account */ { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); ut_ad(mode == LOCK_X || mode == LOCK_S); /* Only GAP lock can be on SUPREMUM, and we are not looking @@ -799,7 +834,7 @@ static const lock_t *lock_rec_other_has_conflicting( ulint heap_no, /*!< in: heap number of the record */ const trx_t *trx) /*!< in: our transaction */ { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); ut_ad(!(mode & ~(ulint)(LOCK_MODE_MASK | LOCK_GAP | LOCK_REC_NOT_GAP | LOCK_INSERT_INTENTION))); ut_ad(!(mode & LOCK_PREDICATE)); @@ -815,20 +850,20 @@ static const lock_t *lock_rec_other_has_conflicting( /** Checks if some transaction has an implicit x-lock on a record in a secondary index. + @param[in] rec user record + @param[in] index secondary index + @param[in] offsets rec_get_offsets(rec, index) @return transaction id of the transaction which has the x-lock, or 0; NOTE that this function can return false positives but never false - negatives. The caller must confirm all positive results by calling - trx_is_active(). 
*/ -static trx_t *lock_sec_rec_some_has_impl( - const rec_t *rec, /*!< in: user record */ - dict_index_t *index, /*!< in: secondary index */ - const ulint *offsets) /*!< in: rec_get_offsets(rec, index) */ -{ + negatives. The caller must confirm all positive results by checking if the trx + is still active. */ +static trx_t *lock_sec_rec_some_has_impl(const rec_t *rec, dict_index_t *index, + const ulint *offsets) { trx_t *trx; trx_id_t max_trx_id; const page_t *page = page_align(rec); - ut_ad(!lock_mutex_own()); + ut_ad(!locksys::owns_exclusive_global_latch()); ut_ad(!trx_sys_mutex_own()); ut_ad(!index->is_clustered()); ut_ad(page_rec_is_user_rec(rec)); @@ -875,7 +910,8 @@ static bool lock_rec_other_trx_holds_expl(ulint precise_mode, const trx_t *trx, const buf_block_t *block) { bool holds = false; - lock_mutex_enter(); + /* We will inspect locks from various shards when inspecting transactions. */ + locksys::Global_exclusive_latch_guard guard{}; /* If trx_rw_is_active returns non-null impl_trx it only means that impl_trx was active at some moment during the call, but might already be in TRX_STATE_COMMITTED_IN_MEMORY when we execute the body of the if. @@ -903,20 +939,15 @@ static bool lock_rec_other_trx_holds_expl(ulint precise_mode, const trx_t *trx, mutex_exit(&trx_sys->mutex); } - lock_mutex_exit(); - return (holds); } #endif /* UNIV_DEBUG */ -/** Return approximate number or record locks (bits set in the bitmap) for - this transaction. Since delete-marked records may be removed, the - record count will not be precise. - The caller must be holding lock_sys->mutex. */ -ulint lock_number_of_rows_locked( - const trx_lock_t *trx_lock) /*!< in: transaction locks */ -{ - ut_ad(lock_mutex_own()); +ulint lock_number_of_rows_locked(const trx_lock_t *trx_lock) { + /* We need exclusive lock_sys access, because trx_lock->n_rec_locks is + modified while holding sharded lock only, so we need to disable all writers + for this number to be meaningful */ + ut_ad(locksys::owns_exclusive_global_latch()); return (trx_lock->n_rec_locks); } @@ -932,7 +963,7 @@ ulint lock_number_of_tables_locked(const trx_t *trx) { /** Do some checks and prepare for creating a new record lock */ void RecLock::prepare() const { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(m_rec_id.get_page_id())); ut_ad(m_trx == thr_get_trx(m_thr)); /* Test if there already is some other reason to suspend thread: @@ -969,7 +1000,7 @@ Create the lock instance @return a record lock instance */ lock_t *RecLock::lock_alloc(trx_t *trx, dict_index_t *index, ulint mode, const RecID &rec_id, ulint size) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(rec_id.get_page_id())); /* We are about to modify structures in trx->lock which needs trx->mutex */ ut_ad(trx_mutex_own(trx)); @@ -991,7 +1022,7 @@ lock_t *RecLock::lock_alloc(trx_t *trx, dict_index_t *index, ulint mode, lock->index = index; /* Note the creation timestamp */ - ut_d(lock->m_seq = ++lock_sys->m_seq); + ut_d(lock->m_seq = lock_sys->m_seq.fetch_add(1)); /* Setup the lock attributes */ @@ -1013,9 +1044,9 @@ lock_t *RecLock::lock_alloc(trx_t *trx, dict_index_t *index, ulint mode, memset(&lock[1], 0x0, size); } - rec_lock.space = rec_id.m_space_id; + rec_lock.space = rec_id.get_page_id().space(); - rec_lock.page_no = rec_id.m_page_no; + rec_lock.page_no = rec_id.get_page_id().page_no(); /* Set the bit corresponding to rec */ @@ -1036,7 +1067,8 @@ static void lock_rec_insert_to_waiting(hash_table_t *lock_hash, lock_t *lock, const RecID &rec_id) { 
ut_ad(lock->is_waiting()); ut_ad(rec_id.matches(lock)); - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); + ut_ad(locksys::owns_page_shard(rec_id.get_page_id())); const ulint fold = rec_id.fold(); HASH_INSERT(lock_t, hash, lock_hash, fold, lock); @@ -1049,7 +1081,8 @@ static void lock_rec_insert_to_waiting(hash_table_t *lock_hash, lock_t *lock, static void lock_rec_insert_to_granted(hash_table_t *lock_hash, lock_t *lock, const RecID &rec_id) { ut_ad(rec_id.matches(lock)); - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); + ut_ad(locksys::owns_page_shard(rec_id.get_page_id())); ut_ad(!lock->is_waiting()); /* Move the target lock to the head of the list. */ @@ -1063,17 +1096,46 @@ static void lock_rec_insert_to_granted(hash_table_t *lock_hash, lock_t *lock, cell->node = lock; lock->hash = next; } +namespace locksys { +/** +Adds the lock to the list of trx's locks. +Requires lock->trx to be already set. +Bumps the trx_lock_version. +@param[in,out] lock The lock that we want to add to lock->trx->lock.trx_locks +*/ +static void add_to_trx_locks(lock_t *lock) { + ut_ad(lock->trx != nullptr); + ut_ad(trx_mutex_own(lock->trx)); + UT_LIST_ADD_LAST(lock->trx->lock.trx_locks, lock); + lock->trx->lock.trx_locks_version++; +} + +/** +Removes the lock from the list of trx's locks. +Bumps the trx_lock_version. +@param[in,out] lock The lock that we want to remove from + lock->trx->lock.trx_locks +*/ +static void remove_from_trx_locks(lock_t *lock) { + ut_ad(lock->trx != nullptr); + ut_ad(trx_mutex_own(lock->trx)); + UT_LIST_REMOVE(lock->trx->lock.trx_locks, lock); + lock->trx->lock.trx_locks_version++; +} +} // namespace locksys void RecLock::lock_add(lock_t *lock) { ut_ad((lock->type_mode | LOCK_REC) == (m_mode | LOCK_REC)); - ut_ad(lock_mutex_own()); + ut_ad(m_rec_id.matches(lock)); + ut_ad(locksys::owns_page_shard(m_rec_id.get_page_id())); + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); ut_ad(trx_mutex_own(lock->trx)); bool wait = m_mode & LOCK_WAIT; hash_table_t *lock_hash = lock_hash_get(m_mode); - ++lock->index->table->n_rec_locks; + lock->index->table->n_rec_locks.fetch_add(1, std::memory_order_relaxed); if (!wait) { lock_rec_insert_to_granted(lock_hash, lock, m_rec_id); @@ -1090,7 +1152,7 @@ void RecLock::lock_add(lock_t *lock) { #endif /* HAVE_PSI_DATA_LOCK_INTERFACE */ #endif /* HAVE_PSI_THREAD_INTERFACE */ - UT_LIST_ADD_LAST(lock->trx->lock.trx_locks, lock); + locksys::add_to_trx_locks(lock); if (wait) { lock_set_lock_and_trx_wait(lock); @@ -1102,7 +1164,7 @@ void RecLock::lock_add(lock_t *lock) { @param[in] prdt Predicate lock (optional) @return a new lock instance */ lock_t *RecLock::create(trx_t *trx, const lock_prdt_t *prdt) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(m_rec_id.get_page_id())); /* Ensure that another transaction doesn't access the trx lock state and lock data structures while we are adding the @@ -1193,7 +1255,7 @@ static void lock_mark_trx_for_rollback(hit_list_t &hit_list, trx_id_t hp_trx_id, static void lock_create_wait_for_edge(trx_t *waiter, trx_t *blocker) { ut_ad(trx_mutex_own(waiter)); ut_ad(waiter->lock.wait_lock != nullptr); - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_lock_shard(waiter->lock.wait_lock)); ut_ad(waiter->lock.blocking_trx.load() == nullptr); /* We don't call lock_wait_request_check_for_cycles() here as it would be slightly premature: the trx is not yet inserted into a slot of @@ -1209,7 +1271,7 @@ static void 
lock_create_wait_for_edge(trx_t *waiter, trx_t *blocker) { Setup the requesting transaction state for lock grant @param[in,out] lock Lock for which to change state */ void RecLock::set_wait_state(lock_t *lock) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); ut_ad(m_trx == lock->trx); ut_ad(trx_mutex_own(m_trx)); ut_ad(lock_get_wait(lock)); @@ -1225,7 +1287,7 @@ void RecLock::set_wait_state(lock_t *lock) { } dberr_t RecLock::add_to_waitq(const lock_t *wait_for, const lock_prdt_t *prdt) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(m_rec_id.get_page_id())); ut_ad(m_trx == thr_get_trx(m_thr)); /* It is not that the body of this function requires trx->mutex, but some of @@ -1269,9 +1331,10 @@ rows (and thus: queues) this function moves it to the front of whole bucket. @param [in] lock a granted lock to be moved @param [in] rec_id record id which specifies particular queue and bucket */ static void lock_rec_move_granted_to_front(lock_t *lock, const RecID &rec_id) { - ut_ad(lock_mutex_own()); ut_ad(!lock->is_waiting()); ut_ad(rec_id.matches(lock)); + ut_ad(locksys::owns_page_shard(rec_id.get_page_id())); + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); const auto hash_table = lock->hash_table(); HASH_DELETE(lock_t, hash, hash_table, rec_id.fold(), lock); @@ -1294,7 +1357,7 @@ UNIV_INLINE lock_t *lock_rec_find_similar_on_page(uint32_t type_mode, size_t heap_no, lock_t *lock, const trx_t *trx, bool &found_waiter_before_lock) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); found_waiter_before_lock = false; for (/* No op */; lock != nullptr; lock = lock_rec_get_next_on_page(lock)) { if (lock->trx == trx && lock->type_mode == type_mode && @@ -1327,7 +1390,7 @@ static void lock_rec_add_to_queue(ulint type_mode, const buf_block_t *block, trx_t *trx, const bool we_own_trx_mutex = false) { #ifdef UNIV_DEBUG - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); ut_ad(we_own_trx_mutex == trx_mutex_own(trx)); ut_ad(index->is_clustered() || @@ -1436,7 +1499,7 @@ lock_rec_req_status lock_rec_lock_fast( dict_index_t *index, /*!< in: index of record */ que_thr_t *thr) /*!< in: query thread */ { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); ut_ad(!srv_read_only_mode); ut_ad((LOCK_MODE_MASK & mode) != LOCK_S || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); @@ -1549,7 +1612,7 @@ DB_SKIP_LOCKED, or DB_LOCK_NOWAIT */ static dberr_t lock_rec_lock_slow(bool impl, select_mode sel_mode, ulint mode, const buf_block_t *block, ulint heap_no, dict_index_t *index, que_thr_t *thr) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); ut_ad(!srv_read_only_mode); ut_ad((LOCK_MODE_MASK & mode) != LOCK_S || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); @@ -1663,7 +1726,7 @@ DB_SKIP_LOCKED, or DB_LOCK_NOWAIT */ static dberr_t lock_rec_lock(bool impl, select_mode sel_mode, ulint mode, const buf_block_t *block, ulint heap_no, dict_index_t *index, que_thr_t *thr) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); ut_ad(!srv_read_only_mode); ut_ad((LOCK_MODE_MASK & mode) != LOCK_S || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); @@ -1711,7 +1774,7 @@ static const lock_t *lock_rec_has_to_wait_in_queue( ulint bit_offset; hash_table_t *hash; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(wait_lock->rec_lock.get_page_id())); 
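For the trx_locks_version counter bumped by locksys::add_to_trx_locks() and remove_from_trx_locks() above, the intended reader-side pattern can be sketched as follows (helper names invented): snapshot the version under trx->mutex, release the mutex to take lock_sys latches in the proper order, re-acquire it, and restart the scan of trx->lock.trx_locks if the version moved.

/* Hedged sketch, not server code. */
static uint64_t example_trx_locks_snapshot(const trx_t *trx) {
  ut_ad(trx_mutex_own(trx));
  return trx->lock.trx_locks_version;
}

static bool example_trx_locks_scan_is_stale(const trx_t *trx,
                                            uint64_t snapshot) {
  ut_ad(trx_mutex_own(trx));
  /* Every add_to_trx_locks()/remove_from_trx_locks() increments the version,
  so an unchanged value means the cached iteration state is still valid. */
  return trx->lock.trx_locks_version != snapshot;
}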
ut_ad(lock_get_wait(wait_lock)); ut_ad(lock_get_type_low(wait_lock) == LOCK_REC); @@ -1739,10 +1802,12 @@ static const lock_t *lock_rec_has_to_wait_in_queue( } /** Grants a lock to a waiting lock request and releases the waiting - transaction. The caller must hold lock_sys->mutex but not lock->trx->mutex. */ -static void lock_grant(lock_t *lock) /*!< in/out: waiting lock request */ -{ - ut_ad(lock_mutex_own()); +transaction. The caller must hold lock_sys latch for the shard containing the +lock, but not the lock->trx->mutex. +@param[in,out] lock waiting lock request + */ +static void lock_grant(lock_t *lock) { + ut_ad(locksys::owns_lock_shard(lock)); ut_ad(!trx_mutex_own(lock->trx)); trx_mutex_enter(lock->trx); @@ -1754,6 +1819,7 @@ static void lock_grant(lock_t *lock) /*!< in/out: waiting lock request */ ib::error(ER_IB_MSG_637) << "Transaction already had an" << " AUTO-INC lock!"; } else { + ut_ad(table->autoinc_trx == nullptr); table->autoinc_trx = lock->trx; ib_vector_push(lock->trx->lock.autoinc_locks, &lock); @@ -1774,18 +1840,39 @@ void lock_make_trx_hit_list(trx_t *hp_trx, hit_list_t &hit_list) { const trx_id_t hp_trx_id = hp_trx->id; ut_ad(trx_can_be_handled_by_current_thread(hp_trx)); ut_ad(trx_is_high_priority(hp_trx)); - const lock_t *lock = hp_trx->lock.wait_lock; - bool waits_for_record = (nullptr != lock && lock->is_record_lock()); + /* To avoid slow procedure involving global exclusive latch below, we first + check if this transaction is waiting for a lock at all. It's unsafe to read + hp->lock.wait_lock without latching whole lock_sys as it might temporarily + change to NULL during a concurrent B-tree reorganization, even though the + trx actually is still waiting. + TBD: Is it safe to use hp_trx->lock.que_state == TRX_QUE_LOCK_WAIT given that + que_state is not atomic, and writes to it happen without trx->mutex ? */ + const bool is_waiting = (hp_trx->lock.blocking_trx.load() != nullptr); trx_mutex_exit(hp_trx); - if (!waits_for_record) { + if (!is_waiting) { return; } - lock_mutex_enter(); + /* Current implementation of lock_make_trx_hit_list requires latching whole + lock_sys for following reasons: + 1. it may call lock_cancel_waiting_and_release on a lock from completely + different shard of lock_sys than hp_trx->lock.wait_lock. Trying to latch + this other shard might create a deadlock cycle if it violates ordering of + shard latches (and there is 50% chance it will violate it). Moreover the + lock_cancel_waiting_and_release() requires an exclusive latch to avoid + deadlocks among trx->mutex-es, and trx->lock.wait_lock might be a table lock, + in which case exclusive latch is also needed to traverse table locks. + 2. it may call trx_mutex_enter on a transaction which is waiting for a + lock, which violates one of assumptions used in the proof that a deadlock due + to acquiring trx->mutex-es is impossible + 3. it attempts to read hp_trx->lock.wait_lock which might be modified by a + thread during B-tree reorganization when moving locks between queues + 4. 
it attempts to operate on trx->lock.wait_lock of other transactions */ + locksys::Global_exclusive_latch_guard guard{}; /* Check again */ - if (lock != hp_trx->lock.wait_lock) { - lock_mutex_exit(); + const lock_t *lock = hp_trx->lock.wait_lock; + if (lock == nullptr || !lock->is_record_lock()) { return; } RecID rec_id{lock, lock_rec_find_set_bit(lock)}; @@ -1842,8 +1929,6 @@ void lock_make_trx_hit_list(trx_t *hp_trx, hit_list_t &hit_list) { return true; }, lock->hash_table()); - - lock_mutex_exit(); } /** Cancels a waiting record lock request and releases the waiting transaction @@ -1852,8 +1937,8 @@ void lock_make_trx_hit_list(trx_t *hp_trx, hit_list_t &hit_list) { static void lock_rec_cancel( lock_t *lock) /*!< in: waiting record lock request */ { - ut_ad(lock_mutex_own()); ut_ad(lock_get_type_low(lock) == LOCK_REC); + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); /* Reset the bit (there can be only one set bit) in the lock bitmap */ lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock)); @@ -1873,7 +1958,8 @@ waiting_lock->trx points to blocking_lock->trx wait */ static void lock_update_wait_for_edge(const lock_t *waiting_lock, const lock_t *blocking_lock) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_lock_shard(waiting_lock)); + ut_ad(locksys::owns_lock_shard(blocking_lock)); ut_ad(waiting_lock->is_waiting()); ut_ad(lock_has_to_wait(waiting_lock, blocking_lock)); /* Still needs to wait, but perhaps the reason has changed */ @@ -1898,7 +1984,7 @@ static const lock_t *lock_rec_has_to_wait_for_granted( const size_t new_granted_index) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(wait_lock->rec_lock.get_page_id())); ut_ad(wait_lock->is_record_lock()); ut_ad(new_granted_index <= granted.size()); @@ -1945,8 +2031,8 @@ lock was (or still is) held */ static void lock_rec_grant_by_heap_no(lock_t *in_lock, ulint heap_no) { const auto hash_table = in_lock->hash_table(); - ut_ad(lock_mutex_own()); ut_ad(in_lock->is_record_lock()); + ut_ad(locksys::owns_page_shard(in_lock->rec_lock.get_page_id())); using LockDescriptorEx = std::pair; /* Preallocate for 4 lists with 32 locks. */ @@ -2164,32 +2250,27 @@ static void lock_rec_dequeue_from_page(lock_t *in_lock) { void lock_rec_discard(lock_t *in_lock) { space_id_t space; page_no_t page_no; - trx_lock_t *trx_lock; - ut_ad(lock_mutex_own()); ut_ad(lock_get_type_low(in_lock) == LOCK_REC); - - trx_lock = &in_lock->trx->lock; + ut_ad(locksys::owns_page_shard(in_lock->rec_lock.get_page_id())); space = in_lock->rec_lock.space; page_no = in_lock->rec_lock.page_no; - ut_ad(in_lock->index->table->n_rec_locks > 0); - in_lock->index->table->n_rec_locks--; + ut_ad(in_lock->index->table->n_rec_locks.load() > 0); + in_lock->index->table->n_rec_locks.fetch_sub(1, std::memory_order_relaxed); /* We want the state of lock queue and trx_locks list to be synchronized atomically from the point of view of people using trx->mutex, so we perform - HASH_DELETE and UT_LIST_REMOVE while holding trx->mutex. - It might be the case that we already hold trx->mutex here, for example if we - came here from lock_release(trx). */ + HASH_DELETE and UT_LIST_REMOVE while holding trx->mutex. 
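The counters touched above (trx->lock.n_rec_locks, table->n_rec_locks) follow a common discipline in this patch: writers update them with relaxed atomics while holding only some shard latch, and a reader that needs an exact value takes the exclusive global latch, which shuts out all such writers. A hedged sketch with invented helper names:

static void example_on_rec_lock_created(trx_t *trx) {
  /* Holding any shard latch (and thus the shared global latch) is enough for
  a writer. */
  ut_ad(locksys::owns_shared_global_latch());
  trx->lock.n_rec_locks.fetch_add(1, std::memory_order_relaxed);
}

static ulint example_exact_rec_lock_count(const trx_t *trx) {
  /* An exact read must exclude all writers at once. */
  ut_ad(locksys::owns_exclusive_global_latch());
  return trx->lock.n_rec_locks.load();
}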
*/ ut_ad(trx_mutex_own(in_lock->trx)); + locksys::remove_from_trx_locks(in_lock); + HASH_DELETE(lock_t, hash, lock_hash_get(in_lock->type_mode), lock_rec_fold(space, page_no), in_lock); - UT_LIST_REMOVE(trx_lock->trx_locks, in_lock); - MONITOR_INC(MONITOR_RECLOCK_REMOVED); MONITOR_DEC(MONITOR_NUM_RECLOCK); } @@ -2229,7 +2310,7 @@ void lock_rec_free_all_from_discard_page( space_id_t space; page_no_t page_no; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); space = block->page.id.space(); page_no = block->page.id.page_no(); @@ -2252,7 +2333,7 @@ static void lock_rec_reset_and_release_wait_low( { lock_t *lock; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); for (lock = lock_rec_get_first(hash, block, heap_no); lock != nullptr; lock = lock_rec_get_next(heap_no, lock)) { @@ -2308,7 +2389,8 @@ static void lock_rec_inherit_to_gap( { lock_t *lock; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(heir_block->get_page_id())); + ut_ad(locksys::owns_page_shard(block->get_page_id())); /* If session is using READ COMMITTED or READ UNCOMMITTED isolation level, we do not want locks set by an UPDATE or a DELETE to be @@ -2367,7 +2449,7 @@ static void lock_rec_inherit_to_gap_if_gap_lock( { lock_t *lock; - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; for (lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); lock != nullptr; lock = lock_rec_get_next(heap_no, lock)) { @@ -2382,8 +2464,6 @@ static void lock_rec_inherit_to_gap_if_gap_lock( heir_heap_no, lock->index, lock->trx); } } - - lock_mutex_exit(); } /** Moves the locks of a record to another record and resets the lock bits of @@ -2403,7 +2483,8 @@ static void lock_rec_move_low( { lock_t *lock; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(receiver->get_page_id())); + ut_ad(locksys::owns_page_shard(donator->get_page_id())); /* If the lock is predicate lock, it resides on INFIMUM record */ ut_ad(lock_rec_get_first(lock_hash, receiver, receiver_heap_no) == nullptr || @@ -2492,109 +2573,97 @@ void lock_move_reorganize_page( UT_LIST_BASE_NODE_T(lock_t) old_locks; mem_heap_t *heap = nullptr; ulint comp; + { + /* We only process locks on block, not oblock */ + locksys::Shard_latch_guard guard{block->get_page_id()}; - lock_mutex_enter(); - - /* FIXME: This needs to deal with predicate lock too */ - lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); - - if (lock == nullptr) { - lock_mutex_exit(); - - return; - } - - heap = mem_heap_create(256); - - /* Copy first all the locks on the page to heap and reset the - bitmaps in the original locks; chain the copies of the locks - using the trx_locks field in them. */ + /* FIXME: This needs to deal with predicate lock too */ + lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); - UT_LIST_INIT(old_locks, &lock_t::trx_locks); + if (lock == nullptr) { + return; + } - do { - /* Make a copy of the lock */ - lock_t *old_lock = lock_rec_copy(lock, heap); + heap = mem_heap_create(256); - UT_LIST_ADD_LAST(old_locks, old_lock); + /* Copy first all the locks on the page to heap and reset the + bitmaps in the original locks; chain the copies of the locks + using the trx_locks field in them. 
*/ - /* Reset bitmap of lock */ - lock_rec_bitmap_reset(lock); + UT_LIST_INIT(old_locks, &lock_t::trx_locks); - if (lock_get_wait(lock)) { - lock_reset_lock_and_trx_wait(lock); - } + do { + /* Make a copy of the lock */ + lock_t *old_lock = lock_rec_copy(lock, heap); - lock = lock_rec_get_next_on_page(lock); - } while (lock != nullptr); + UT_LIST_ADD_LAST(old_locks, old_lock); - comp = page_is_comp(block->frame); - ut_ad(comp == page_is_comp(oblock->frame)); + /* Reset bitmap of lock */ + lock_rec_bitmap_reset(lock); - lock_move_granted_locks_to_front(old_locks); + if (lock_get_wait(lock)) { + lock_reset_lock_and_trx_wait(lock); + } - DBUG_EXECUTE_IF("do_lock_reverse_page_reorganize", - UT_LIST_REVERSE(old_locks);); + lock = lock_rec_get_next_on_page(lock); + } while (lock != nullptr); - for (lock = UT_LIST_GET_FIRST(old_locks); lock != nullptr; - lock = UT_LIST_GET_NEXT(trx_locks, lock)) { - /* NOTE: we copy also the locks set on the infimum and - supremum of the page; the infimum may carry locks if an - update of a record is occurring on the page, and its locks - were temporarily stored on the infimum */ - const rec_t *rec1 = page_get_infimum_rec(buf_block_get_frame(block)); - const rec_t *rec2 = page_get_infimum_rec(buf_block_get_frame(oblock)); - - /* Set locks according to old locks */ - for (;;) { - ulint old_heap_no; - ulint new_heap_no; + comp = page_is_comp(block->frame); + ut_ad(comp == page_is_comp(oblock->frame)); - if (comp) { - old_heap_no = rec_get_heap_no_new(rec2); - new_heap_no = rec_get_heap_no_new(rec1); + lock_move_granted_locks_to_front(old_locks); - rec1 = page_rec_get_next_low(rec1, true); - rec2 = page_rec_get_next_low(rec2, true); - } else { - old_heap_no = rec_get_heap_no_old(rec2); - new_heap_no = rec_get_heap_no_old(rec1); - ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + DBUG_EXECUTE_IF("do_lock_reverse_page_reorganize", + UT_LIST_REVERSE(old_locks);); - rec1 = page_rec_get_next_low(rec1, false); - rec2 = page_rec_get_next_low(rec2, false); - } + for (lock = UT_LIST_GET_FIRST(old_locks); lock != nullptr; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + /* NOTE: we copy also the locks set on the infimum and + supremum of the page; the infimum may carry locks if an + update of a record is occurring on the page, and its locks + were temporarily stored on the infimum */ + const rec_t *rec1 = page_get_infimum_rec(buf_block_get_frame(block)); + const rec_t *rec2 = page_get_infimum_rec(buf_block_get_frame(oblock)); + + /* Set locks according to old locks */ + for (;;) { + ulint old_heap_no; + ulint new_heap_no; + + if (comp) { + old_heap_no = rec_get_heap_no_new(rec2); + new_heap_no = rec_get_heap_no_new(rec1); + + rec1 = page_rec_get_next_low(rec1, true); + rec2 = page_rec_get_next_low(rec2, true); + } else { + old_heap_no = rec_get_heap_no_old(rec2); + new_heap_no = rec_get_heap_no_old(rec1); + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + + rec1 = page_rec_get_next_low(rec1, false); + rec2 = page_rec_get_next_low(rec2, false); + } - /* Clear the bit in old_lock. */ - if (old_heap_no < lock->rec_lock.n_bits && - lock_rec_reset_nth_bit(lock, old_heap_no)) { - /* NOTE that the old lock bitmap could be too - small for the new heap number! */ + /* Clear the bit in old_lock. */ + if (old_heap_no < lock->rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, old_heap_no)) { + /* NOTE that the old lock bitmap could be too + small for the new heap number! 
*/ - lock_rec_add_to_queue(lock->type_mode, block, new_heap_no, lock->index, - lock->trx); - } + lock_rec_add_to_queue(lock->type_mode, block, new_heap_no, + lock->index, lock->trx); + } - if (new_heap_no == PAGE_HEAP_NO_SUPREMUM) { - ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM); - break; + if (new_heap_no == PAGE_HEAP_NO_SUPREMUM) { + ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM); + break; + } } - } - -#ifdef UNIV_DEBUG - { - ulint i = lock_rec_find_set_bit(lock); - /* Check that all locks were moved. */ - if (i != ULINT_UNDEFINED) { - ib::fatal(ER_IB_MSG_640) << "lock_move_reorganize_page(): " << i - << " not moved in " << (void *)lock; - } + ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED); } -#endif /* UNIV_DEBUG */ - } - - lock_mutex_exit(); + } /* Shard_latch_guard */ mem_heap_free(heap); @@ -2617,75 +2686,75 @@ void lock_move_rec_list_end( ut_ad(buf_block_get_frame(block) == page_align(rec)); ut_ad(comp == page_is_comp(buf_block_get_frame(new_block))); - lock_mutex_enter(); + { + locksys::Shard_latches_guard guard{*block, *new_block}; - for (lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); lock; - lock = lock_rec_get_next_on_page(lock)) { - const rec_t *rec1 = rec; - const rec_t *rec2; - const ulint type_mode = lock->type_mode; + for (lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); lock; + lock = lock_rec_get_next_on_page(lock)) { + const rec_t *rec1 = rec; + const rec_t *rec2; + const ulint type_mode = lock->type_mode; - if (comp) { - if (page_offset(rec1) == PAGE_NEW_INFIMUM) { - rec1 = page_rec_get_next_low(rec1, true); - } + if (comp) { + if (page_offset(rec1) == PAGE_NEW_INFIMUM) { + rec1 = page_rec_get_next_low(rec1, true); + } - rec2 = page_rec_get_next_low( - buf_block_get_frame(new_block) + PAGE_NEW_INFIMUM, true); - } else { - if (page_offset(rec1) == PAGE_OLD_INFIMUM) { - rec1 = page_rec_get_next_low(rec1, false); + rec2 = page_rec_get_next_low( + buf_block_get_frame(new_block) + PAGE_NEW_INFIMUM, true); + } else { + if (page_offset(rec1) == PAGE_OLD_INFIMUM) { + rec1 = page_rec_get_next_low(rec1, false); + } + + rec2 = page_rec_get_next_low( + buf_block_get_frame(new_block) + PAGE_OLD_INFIMUM, false); } - rec2 = page_rec_get_next_low( - buf_block_get_frame(new_block) + PAGE_OLD_INFIMUM, false); - } + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ - /* Copy lock requests on user records to new page and - reset the lock bits on the old */ + for (;;) { + ulint rec1_heap_no; + ulint rec2_heap_no; - for (;;) { - ulint rec1_heap_no; - ulint rec2_heap_no; + if (comp) { + rec1_heap_no = rec_get_heap_no_new(rec1); - if (comp) { - rec1_heap_no = rec_get_heap_no_new(rec1); + if (rec1_heap_no == PAGE_HEAP_NO_SUPREMUM) { + break; + } - if (rec1_heap_no == PAGE_HEAP_NO_SUPREMUM) { - break; - } + rec2_heap_no = rec_get_heap_no_new(rec2); + rec1 = page_rec_get_next_low(rec1, true); + rec2 = page_rec_get_next_low(rec2, true); + } else { + rec1_heap_no = rec_get_heap_no_old(rec1); - rec2_heap_no = rec_get_heap_no_new(rec2); - rec1 = page_rec_get_next_low(rec1, true); - rec2 = page_rec_get_next_low(rec2, true); - } else { - rec1_heap_no = rec_get_heap_no_old(rec1); + if (rec1_heap_no == PAGE_HEAP_NO_SUPREMUM) { + break; + } - if (rec1_heap_no == PAGE_HEAP_NO_SUPREMUM) { - break; - } + rec2_heap_no = rec_get_heap_no_old(rec2); - rec2_heap_no = rec_get_heap_no_old(rec2); + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); - ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + rec1 = 
page_rec_get_next_low(rec1, false); + rec2 = page_rec_get_next_low(rec2, false); + } - rec1 = page_rec_get_next_low(rec1, false); - rec2 = page_rec_get_next_low(rec2, false); - } + if (rec1_heap_no < lock->rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, rec1_heap_no)) { + if (type_mode & LOCK_WAIT) { + lock_reset_lock_and_trx_wait(lock); + } - if (rec1_heap_no < lock->rec_lock.n_bits && - lock_rec_reset_nth_bit(lock, rec1_heap_no)) { - if (type_mode & LOCK_WAIT) { - lock_reset_lock_and_trx_wait(lock); + lock_rec_add_to_queue(type_mode, new_block, rec2_heap_no, lock->index, + lock->trx); } - - lock_rec_add_to_queue(type_mode, new_block, rec2_heap_no, lock->index, - lock->trx); } } - } - - lock_mutex_exit(); + } /* Shard_latches_guard */ #ifdef UNIV_DEBUG_LOCK_VALIDATE ut_ad(lock_rec_validate_page(block)); @@ -2714,73 +2783,70 @@ void lock_move_rec_list_start(const buf_block_t *new_block, /*!< in: index page ut_ad(new_block->frame == page_align(old_end)); ut_ad(comp == page_rec_is_comp(old_end)); - lock_mutex_enter(); - - for (lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); lock; - lock = lock_rec_get_next_on_page(lock)) { - const rec_t *rec1; - const rec_t *rec2; - const ulint type_mode = lock->type_mode; + { + locksys::Shard_latches_guard guard{*block, *new_block}; - if (comp) { - rec1 = page_rec_get_next_low( - buf_block_get_frame(block) + PAGE_NEW_INFIMUM, true); - rec2 = page_rec_get_next_low(old_end, true); - } else { - rec1 = page_rec_get_next_low( - buf_block_get_frame(block) + PAGE_OLD_INFIMUM, false); - rec2 = page_rec_get_next_low(old_end, false); - } + for (lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); lock; + lock = lock_rec_get_next_on_page(lock)) { + const rec_t *rec1; + const rec_t *rec2; + const ulint type_mode = lock->type_mode; - /* Copy lock requests on user records to new page and - reset the lock bits on the old */ + if (comp) { + rec1 = page_rec_get_next_low( + buf_block_get_frame(block) + PAGE_NEW_INFIMUM, true); + rec2 = page_rec_get_next_low(old_end, true); + } else { + rec1 = page_rec_get_next_low( + buf_block_get_frame(block) + PAGE_OLD_INFIMUM, false); + rec2 = page_rec_get_next_low(old_end, false); + } - while (rec1 != rec) { - ulint rec1_heap_no; - ulint rec2_heap_no; + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ - if (comp) { - rec1_heap_no = rec_get_heap_no_new(rec1); - rec2_heap_no = rec_get_heap_no_new(rec2); + while (rec1 != rec) { + ulint rec1_heap_no; + ulint rec2_heap_no; - rec1 = page_rec_get_next_low(rec1, true); - rec2 = page_rec_get_next_low(rec2, true); - } else { - rec1_heap_no = rec_get_heap_no_old(rec1); - rec2_heap_no = rec_get_heap_no_old(rec2); + if (comp) { + rec1_heap_no = rec_get_heap_no_new(rec1); + rec2_heap_no = rec_get_heap_no_new(rec2); - ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + rec1 = page_rec_get_next_low(rec1, true); + rec2 = page_rec_get_next_low(rec2, true); + } else { + rec1_heap_no = rec_get_heap_no_old(rec1); + rec2_heap_no = rec_get_heap_no_old(rec2); - rec1 = page_rec_get_next_low(rec1, false); - rec2 = page_rec_get_next_low(rec2, false); - } + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); - if (rec1_heap_no < lock->rec_lock.n_bits && - lock_rec_reset_nth_bit(lock, rec1_heap_no)) { - if (type_mode & LOCK_WAIT) { - lock_reset_lock_and_trx_wait(lock); + rec1 = page_rec_get_next_low(rec1, false); + rec2 = page_rec_get_next_low(rec2, false); } - lock_rec_add_to_queue(type_mode, new_block, rec2_heap_no, lock->index, - 
lock->trx); + if (rec1_heap_no < lock->rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, rec1_heap_no)) { + if (type_mode & LOCK_WAIT) { + lock_reset_lock_and_trx_wait(lock); + } + + lock_rec_add_to_queue(type_mode, new_block, rec2_heap_no, lock->index, + lock->trx); + } } - } #ifdef UNIV_DEBUG - if (page_rec_is_supremum(rec)) { - ulint i; + if (page_rec_is_supremum(rec)) { + ulint i; - for (i = PAGE_HEAP_NO_USER_LOW; i < lock_rec_get_n_bits(lock); i++) { - if (lock_rec_get_nth_bit(lock, i)) { - ib::fatal(ER_IB_MSG_641) << "lock_move_rec_list_start():" << i - << " not moved in " << (void *)lock; + for (i = PAGE_HEAP_NO_USER_LOW; i < lock_rec_get_n_bits(lock); i++) { + ut_a(!lock_rec_get_nth_bit(lock, i)); } } - } #endif /* UNIV_DEBUG */ - } - - lock_mutex_exit(); + } + } /* Shard_latches_guard */ #ifdef UNIV_DEBUG_LOCK_VALIDATE ut_ad(lock_rec_validate_page(block)); @@ -2809,53 +2875,53 @@ void lock_rtr_move_rec_list(const buf_block_t *new_block, /*!< in: index page to ut_ad(new_block->frame == page_align(rec_move[0].new_rec)); ut_ad(comp == page_rec_is_comp(rec_move[0].new_rec)); - lock_mutex_enter(); + { + locksys::Shard_latches_guard guard{*new_block, *block}; - for (lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); lock; - lock = lock_rec_get_next_on_page(lock)) { - ulint moved = 0; - const rec_t *rec1; - const rec_t *rec2; - const ulint type_mode = lock->type_mode; + for (lock = lock_rec_get_first_on_page(lock_sys->rec_hash, block); lock; + lock = lock_rec_get_next_on_page(lock)) { + ulint moved = 0; + const rec_t *rec1; + const rec_t *rec2; + const ulint type_mode = lock->type_mode; - /* Copy lock requests on user records to new page and - reset the lock bits on the old */ + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ - while (moved < num_move) { - ulint rec1_heap_no; - ulint rec2_heap_no; + while (moved < num_move) { + ulint rec1_heap_no; + ulint rec2_heap_no; - rec1 = rec_move[moved].old_rec; - rec2 = rec_move[moved].new_rec; + rec1 = rec_move[moved].old_rec; + rec2 = rec_move[moved].new_rec; - if (comp) { - rec1_heap_no = rec_get_heap_no_new(rec1); - rec2_heap_no = rec_get_heap_no_new(rec2); + if (comp) { + rec1_heap_no = rec_get_heap_no_new(rec1); + rec2_heap_no = rec_get_heap_no_new(rec2); - } else { - rec1_heap_no = rec_get_heap_no_old(rec1); - rec2_heap_no = rec_get_heap_no_old(rec2); + } else { + rec1_heap_no = rec_get_heap_no_old(rec1); + rec2_heap_no = rec_get_heap_no_old(rec2); - ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); - } - - if (rec1_heap_no < lock->rec_lock.n_bits && - lock_rec_reset_nth_bit(lock, rec1_heap_no)) { - if (type_mode & LOCK_WAIT) { - lock_reset_lock_and_trx_wait(lock); + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); } - lock_rec_add_to_queue(type_mode, new_block, rec2_heap_no, lock->index, - lock->trx); + if (rec1_heap_no < lock->rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, rec1_heap_no)) { + if (type_mode & LOCK_WAIT) { + lock_reset_lock_and_trx_wait(lock); + } - rec_move[moved].moved = true; - } + lock_rec_add_to_queue(type_mode, new_block, rec2_heap_no, lock->index, + lock->trx); - moved++; - } - } + rec_move[moved].moved = true; + } - lock_mutex_exit(); + moved++; + } + } + } /* Shard_latches_guard */ #ifdef UNIV_DEBUG_LOCK_VALIDATE ut_ad(lock_rec_validate_page(block)); @@ -2868,7 +2934,7 @@ void lock_update_split_right( { ulint heap_no = lock_get_min_heap_no(right_block); - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*left_block, *right_block}; /* 
Move the locks on the supremum of the left page to the supremum of the right page */ @@ -2881,8 +2947,6 @@ void lock_update_split_right( lock_rec_inherit_to_gap(left_block, right_block, PAGE_HEAP_NO_SUPREMUM, heap_no); - - lock_mutex_exit(); } /** Updates the lock table when a page is merged to the right. */ @@ -2897,7 +2961,7 @@ void lock_update_merge_right( index page which will be discarded */ { - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*left_block, *right_block}; /* Inherit the locks from the supremum of the left page to the original successor of infimum on the right page, to which the left @@ -2923,8 +2987,6 @@ void lock_update_merge_right( #endif /* UNIV_DEBUG */ lock_rec_free_all_from_discard_page(left_block); - - lock_mutex_exit(); } /** Updates the lock table when the root page is copied to another in @@ -2937,13 +2999,12 @@ void lock_update_root_raise( const buf_block_t *block, /*!< in: index page to which copied */ const buf_block_t *root) /*!< in: root page */ { - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*block, *root}; /* Move the locks on the supremum of the root to the supremum of block */ lock_rec_move(block, root, PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); - lock_mutex_exit(); } /** Updates the lock table when a page is copied to another and the original @@ -2954,15 +3015,13 @@ void lock_update_copy_and_discard( const buf_block_t *block) /*!< in: index page; NOT the root! */ { - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*new_block, *block}; /* Move the locks on the supremum of the old page to the supremum of new_page */ lock_rec_move(new_block, block, PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); lock_rec_free_all_from_discard_page(block); - - lock_mutex_exit(); } /** Updates the lock table when a page is split to the left. */ @@ -2972,15 +3031,13 @@ void lock_update_split_left( { ulint heap_no = lock_get_min_heap_no(right_block); - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*left_block, *right_block}; /* Inherit the locks to the supremum of the left page from the successor of the infimum on the right page */ lock_rec_inherit_to_gap(left_block, right_block, PAGE_HEAP_NO_SUPREMUM, heap_no); - - lock_mutex_exit(); } /** Updates the lock table when a page is merged to the left. */ @@ -2997,7 +3054,7 @@ void lock_update_merge_left( ut_ad(left_block->frame == page_align(orig_pred)); - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*left_block, *right_block}; left_next_rec = page_rec_get_next_const(orig_pred); @@ -3033,8 +3090,6 @@ void lock_update_merge_left( #endif /* UNIV_DEBUG */ lock_rec_free_all_from_discard_page(right_block); - - lock_mutex_exit(); } /** Resets the original locks on heir and replaces them with gap type locks @@ -3051,13 +3106,11 @@ void lock_rec_reset_and_inherit_gap_locks( ulint heap_no) /*!< in: heap_no of the donating record */ { - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*heir_block, *block}; lock_rec_reset_and_release_wait(heir_block, heir_heap_no); lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no); - - lock_mutex_exit(); } /** Updates the lock table when a page is discarded. 
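   NOTE: the following is an illustrative sketch only, not part of the patch.
   The functions converted above share one shape: an operation that moves
   locks between two index pages latches both page shards through a single
   RAII guard instead of the old global lock_mutex_enter() and
   lock_mutex_exit() pair. The name lock_update_foo is made up; the guard and
   the lock_rec_move() call are the ones the real functions above use, and the
   guard is assumed to release its latches at end of scope.

     static void lock_update_foo(const buf_block_t *a, const buf_block_t *b) {
       // latch the shards of both pages for the duration of the move
       locksys::Shard_latches_guard guard{*a, *b};
       lock_rec_move(a, b, PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
     }  // both shard latches released here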
*/ @@ -3073,13 +3126,12 @@ void lock_update_discard( ulint heap_no; const page_t *page = block->frame; - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*heir_block, *block}; if (!lock_rec_get_first_on_page(lock_sys->rec_hash, block) && (!lock_rec_get_first_on_page(lock_sys->prdt_page_hash, block)) && (!lock_rec_get_first_on_page(lock_sys->prdt_hash, block))) { /* No locks exist on page, nothing to do */ - lock_mutex_exit(); return; } @@ -3114,8 +3166,6 @@ void lock_update_discard( } lock_rec_free_all_from_discard_page(block); - - lock_mutex_exit(); } /** Updates the lock table when a new user record is inserted. */ @@ -3161,7 +3211,7 @@ void lock_update_delete( next_heap_no = rec_get_heap_no_old(page + rec_get_next_offs(rec, false)); } - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; /* Let the next record inherit the locks from rec, in gap mode */ @@ -3170,8 +3220,6 @@ void lock_update_delete( /* Reset the lock bits on rec and release waiting transactions */ lock_rec_reset_and_release_wait(block, heap_no); - - lock_mutex_exit(); } /** Stores on the page infimum record the explicit locks of another record. @@ -3192,11 +3240,9 @@ void lock_rec_store_on_page_infimum( ut_ad(block->frame == page_align(rec)); - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no); - - lock_mutex_exit(); } /** Restores the state of explicit lock requests on a single record, where the @@ -3211,13 +3257,12 @@ void lock_rec_restore_from_page_infimum( state; lock bits are reset on the infimum */ { + DEBUG_SYNC_C("lock_rec_restore_from_page_infimum_will_latch"); ulint heap_no = page_rec_get_heap_no(rec); - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*block, *donator}; lock_rec_move(block, donator, heap_no, PAGE_HEAP_NO_INFIMUM); - - lock_mutex_exit(); } /*========================= TABLE LOCKS ==============================*/ @@ -3242,7 +3287,7 @@ lock_t *lock_table_create(dict_table_t *table, /*!< in/out: database table lock_t *lock; ut_ad(table && trx); - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_table_shard(*table)); ut_ad(trx_mutex_own(trx)); ut_ad(trx_can_be_handled_by_current_thread(trx)); @@ -3253,7 +3298,7 @@ lock_t *lock_table_create(dict_table_t *table, /*!< in/out: database table from the transaction lock heap. 
*/ if (type_mode == LOCK_AUTO_INC) { lock = table->autoinc_lock; - + ut_ad(table->autoinc_trx == nullptr); table->autoinc_trx = trx; ib_vector_push(trx->lock.autoinc_locks, &lock); @@ -3264,9 +3309,9 @@ lock_t *lock_table_create(dict_table_t *table, /*!< in/out: database table lock = static_cast( mem_heap_alloc(trx->lock.lock_heap, sizeof(*lock))); } - lock->type_mode = uint32_t(type_mode | LOCK_TABLE); lock->trx = trx; + ut_d(lock->m_seq = lock_sys->m_seq.fetch_add(1)); lock->tab_lock.table = table; @@ -3281,7 +3326,7 @@ lock_t *lock_table_create(dict_table_t *table, /*!< in/out: database table #endif /* HAVE_PSI_DATA_LOCK_INTERFACE */ #endif /* HAVE_PSI_THREAD_INTERFACE */ - UT_LIST_ADD_LAST(trx->lock.trx_locks, lock); + locksys::add_to_trx_locks(lock); ut_list_append(table->locks, lock, TableLockGetNode()); @@ -3304,7 +3349,6 @@ UNIV_INLINE void lock_table_pop_autoinc_locks( trx_t *trx) /*!< in/out: transaction that owns the AUTOINC locks */ { - ut_ad(lock_mutex_own()); /* We will access and modify trx->lock.autoinc_locks so we need trx->mutex */ ut_ad(trx_mutex_own(trx)); ut_ad(!ib_vector_is_empty(trx->lock.autoinc_locks)); @@ -3333,9 +3377,9 @@ void lock_table_remove_autoinc_lock( lock_t *autoinc_lock; lint i = ib_vector_size(trx->lock.autoinc_locks) - 1; - ut_ad(lock_mutex_own()); ut_ad(lock_get_mode(lock) == LOCK_AUTO_INC); ut_ad(lock_get_type_low(lock) & LOCK_TABLE); + ut_ad(locksys::owns_table_shard(*lock->tab_lock.table)); ut_ad(!ib_vector_is_empty(trx->lock.autoinc_locks)); /* With stored functions and procedures the user may drop @@ -3381,18 +3425,17 @@ void lock_table_remove_low(lock_t *lock) /*!< in/out: table lock */ trx_t *trx; dict_table_t *table; - ut_ad(lock_mutex_own()); - trx = lock->trx; /* We will modify trx->lock.trx_locks so we need trx->mutex */ ut_ad(trx_mutex_own(trx)); table = lock->tab_lock.table; + ut_ad(locksys::owns_table_shard(*table)); const auto lock_mode = lock_get_mode(lock); /* Remove the table from the transaction's AUTOINC vector, if the lock that is being released is an AUTOINC lock. */ if (lock_mode == LOCK_AUTO_INC) { - /* The table's AUTOINC lock can get transferred to - another transaction before we get here. */ + /* The table's AUTOINC lock could not be granted to us yet. */ + ut_ad(table->autoinc_trx == trx || lock->is_waiting()); if (table->autoinc_trx == trx) { table->autoinc_trx = nullptr; } @@ -3403,17 +3446,17 @@ void lock_table_remove_low(lock_t *lock) /*!< in/out: table lock */ We only store locks that were granted in the trx->autoinc_locks vector (see lock_table_create() - and lock_grant()). Therefore it can be empty and we - need to check for that. */ + and lock_grant()). 
*/ - if (!lock_get_wait(lock) && !ib_vector_is_empty(trx->lock.autoinc_locks)) { + if (!lock_get_wait(lock)) { lock_table_remove_autoinc_lock(lock, trx); } } ut_a(0 < table->count_by_mode[lock_mode]); --table->count_by_mode[lock_mode]; - UT_LIST_REMOVE(trx->lock.trx_locks, lock); + locksys::remove_from_trx_locks(lock); + ut_list_remove(table->locks, lock, TableLockGetNode()); MONITOR_INC(MONITOR_TABLELOCK_REMOVED); @@ -3431,7 +3474,7 @@ static dberr_t lock_table_enqueue_waiting( { trx_t *trx; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_table_shard(*table)); ut_ad(!srv_read_only_mode); trx = thr_get_trx(thr); @@ -3491,7 +3534,7 @@ const lock_t *lock_table_other_has_incompatible( { const lock_t *lock; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_table_shard(*table)); // According to lock_compatibility_matrix, an intention lock can wait only // for LOCK_S or LOCK_X. If there are no LOCK_S nor LOCK_X locks in the queue, @@ -3500,7 +3543,7 @@ const lock_t *lock_table_other_has_incompatible( // as then there are almost no LOCK_S nor LOCK_X, but many DML queries still // need to get an intention lock to perform their action - while this never // causes them to wait for a "data lock", it might cause them to wait for - // lock_sys->mutex if the operation takes Omega(n). + // lock_sys table shard latch for the duration of table lock queue operation. if ((mode == LOCK_IS || mode == LOCK_IX) && table->count_by_mode[LOCK_S] == 0 && table->count_by_mode[LOCK_X] == 0) { @@ -3568,7 +3611,8 @@ dberr_t lock_table(ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set, (there is some long explanation starting with "How do we prevent crashes caused by ongoing operations...") lock_remove_recovered_trx_record_locks - (this seems to be used during recovery, and recovery is single-threaded) + (this seems to be used to remove locks of recovered transactions from + table being dropped, and recovered transactions shouldn't call lock_table) Also the InnoDB Memcached plugin causes a callchain: innodb_store -> innodb_conn_init -> innodb_api_begin -> innodb_cb_cursor_lock -> ib_cursor_set_lock_mode -> ib_cursor_lock -> ib_trx_lock_table_with_retry @@ -3594,7 +3638,7 @@ dberr_t lock_table(ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set, trx_set_rw_mode(trx); } - lock_mutex_enter(); + locksys::Shard_latch_guard table_latch_guard{*table}; /* We have to check if the new lock is compatible with any locks other transactions have in the table lock queue. */ @@ -3619,7 +3663,6 @@ dberr_t lock_table(ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set, err = DB_SUCCESS; } - lock_mutex_exit(); trx_mutex_exit(trx); ut_ad(err == DB_SUCCESS || err == DB_LOCK_WAIT || err == DB_DEADLOCK); @@ -3635,9 +3678,7 @@ void lock_table_ix_resurrect(dict_table_t *table, /*!< in/out: table */ if (lock_table_has(trx, table, LOCK_IX)) { return; } - - lock_mutex_enter(); - + locksys::Shard_latch_guard table_latch_guard{*table}; /* We have to check if the new lock is compatible with any locks other transactions have in the table lock queue. 
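   As an illustrative aside (assumed shape, built only from names introduced
   by this patch): table lock queue operations now latch just the shard that
   covers the given table, and the helpers they call assert that ownership in
   debug builds, roughly:

     {
       locksys::Shard_latch_guard table_latch_guard{*table};  // this table's shard only
       trx_mutex_enter(trx);
       lock_table_create(table, LOCK_IX, trx);  // asserts locksys::owns_table_shard(*table)
       trx_mutex_exit(trx);
     }  // shard latch released on scope exit

   The braces above only mark the guard's scope; the real code path continues
   below.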
*/ @@ -3645,7 +3686,6 @@ void lock_table_ix_resurrect(dict_table_t *table, /*!< in/out: table */ trx_mutex_enter(trx); lock_table_create(table, LOCK_IX, trx); - lock_mutex_exit(); trx_mutex_exit(trx); } @@ -3663,10 +3703,10 @@ static const lock_t *lock_table_has_to_wait_in_queue( const dict_table_t *table; const lock_t *lock; - ut_ad(lock_mutex_own()); ut_ad(lock_get_wait(wait_lock)); table = wait_lock->tab_lock.table; + ut_ad(locksys::owns_table_shard(*table)); const auto mode = lock_get_mode(wait_lock); @@ -3722,10 +3762,10 @@ static void lock_table_dequeue( behind will get their lock requests granted, if they are now qualified to it */ { - ut_ad(lock_mutex_own()); /* This is needed for lock_table_remove_low(), but it's easier to understand the code if we assert it here as well */ ut_ad(trx_mutex_own(in_lock->trx)); + ut_ad(locksys::owns_table_shard(*in_lock->tab_lock.table)); ut_a(lock_get_type_low(in_lock) == LOCK_TABLE); const auto mode = lock_get_mode(in_lock); @@ -3743,7 +3783,7 @@ static void lock_table_dequeue( // as then there are almost no LOCK_S nor LOCK_X, but many DML queries still // need to get an intention lock to perform their action - while this never // causes them to wait for a "data lock", it might cause them to wait for - // lock_sys->mutex if the operation takes Omega(n) or even Omega(n^2) + // lock_sys table shard latch for the duration of table lock queue operation. if ((mode == LOCK_IS || mode == LOCK_IX) && table->count_by_mode[LOCK_S] == 0 && table->count_by_mode[LOCK_X] == 0) { return; @@ -3815,7 +3855,7 @@ dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, @param[in] lock Lock that was unlocked @param[in] heap_no Heap no within the page for the lock. */ static void lock_rec_release(lock_t *lock, ulint heap_no) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); ut_ad(!lock_get_wait(lock)); ut_ad(lock_get_type_low(lock) == LOCK_REC); ut_ad(lock_rec_get_nth_bit(lock, heap_no)); @@ -3838,47 +3878,47 @@ void lock_rec_unlock( const rec_t *rec, /*!< in: record */ lock_mode lock_mode) /*!< in: LOCK_S or LOCK_X */ { - ut_ad(!trx->lock.wait_lock); ut_ad(block->frame == page_align(rec)); ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); ut_ad(lock_mode == LOCK_S || lock_mode == LOCK_X); ulint heap_no = page_rec_get_heap_no(rec); - lock_mutex_enter(); - trx_mutex_enter(trx); + { + locksys::Shard_latch_guard guard{block->get_page_id()}; + trx_mutex_enter_first_of_two(trx); + ut_ad(!trx->lock.wait_lock); - lock_t *first_lock; + lock_t *first_lock; - first_lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); + first_lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); - /* Find the last lock with the same lock_mode and transaction - on the record. */ + /* Find the last lock with the same lock_mode and transaction + on the record. 
*/ - for (auto lock = first_lock; lock != nullptr; - lock = lock_rec_get_next(heap_no, lock)) { - if (lock->trx == trx && lock_get_mode(lock) == lock_mode && - lock_rec_get_rec_not_gap(lock)) { + for (auto lock = first_lock; lock != nullptr; + lock = lock_rec_get_next(heap_no, lock)) { + if (lock->trx == trx && lock_get_mode(lock) == lock_mode && + lock_rec_get_rec_not_gap(lock)) { #ifdef UNIV_DEBUG - /* Since we actually found the first, not the last lock, lets check - that it is also the last one */ - for (auto lock2 = lock_rec_get_next(heap_no, lock); lock2 != nullptr; - lock2 = lock_rec_get_next(heap_no, lock2)) { - ut_ad(!(lock2->trx == trx && lock_get_mode(lock2) == lock_mode && - lock_rec_get_rec_not_gap(lock2))); - } + /* Since we actually found the first, not the last lock, lets check + that it is also the last one */ + for (auto lock2 = lock_rec_get_next(heap_no, lock); lock2 != nullptr; + lock2 = lock_rec_get_next(heap_no, lock2)) { + ut_ad(!(lock2->trx == trx && lock_get_mode(lock2) == lock_mode && + lock_rec_get_rec_not_gap(lock2))); + } #endif - lock_rec_release(lock, heap_no); + lock_rec_release(lock, heap_no); - lock_mutex_exit(); - trx_mutex_exit(trx); + trx_mutex_exit(trx); - return; + return; + } } - } - lock_mutex_exit(); - trx_mutex_exit(trx); + trx_mutex_exit(trx); + } /* Shard_latch_guard */ { size_t stmt_len; @@ -3940,20 +3980,164 @@ static void lock_release_read_lock(lock_t *lock, bool only_gap) { } } +namespace locksys { + +/** A helper function which solves a chicken-and-egg problem occurring when one +needs to iterate over trx's locks and perform some actions on them. Iterating +over this list requires trx->mutex (or exclusive global lock_sys latch), and +operating on a lock requires lock_sys latches, yet the latching order requires +lock_sys latches to be taken before trx->mutex. +One way around it is to use exclusive global lock_sys latch, which heavily +deteriorates concurrency. Another is to try to reacquire the latches in needed +order, veryfing that the list wasn't modified meanwhile. +This function performs following steps: +1. releases trx->mutex, +2. acquires proper lock_sys shard latch, +3. reaquires trx->mutex +4. executes f unless trx's locks list has changed +Before and after this function following should hold: +- the shared global lock_sys latch is held +- the trx->mutex is held +@param[in] trx the trx, locks of which we are interested in +@param[in] shard description of the shard we want to latch +@param[in] f the function to execute when the shard is latched +@return true if f was called, false if it couldn't be called because trx locks + have changed while relatching trx->mutex +*/ +template +static bool try_relatch_trx_and_shard_and_do(const trx_t *const trx, + const S &shard, F &&f) { + ut_ad(locksys::owns_shared_global_latch()); + ut_ad(trx_mutex_own(trx)); + + const auto expected_version = trx->lock.trx_locks_version; + trx_mutex_exit(trx); + DEBUG_SYNC_C("try_relatch_trx_and_shard_and_do_noted_expected_version"); + locksys::Shard_naked_latch_guard guard{shard}; + trx_mutex_enter_first_of_two(trx); + + /* Check that list was not modified while we were reacquiring latches */ + if (expected_version != trx->lock.trx_locks_version) { + /* Someone has modified the list while we were re-acquiring the latches so, + it is unsafe to operate on the lock. It might have been released, or maybe + even assigned to another transaction (in case of AUTOINC lock). 
More + importantly, we need to let know the caller that the list it is iterating + over has been modified, which affects next/prev pointers. */ + return false; + } + + std::forward(f)(); + return true; +} + +/** A helper function which solves a chicken-and-egg problem occurring when one +needs to iterate over trx's locks and perform some actions on them. Iterating +over this list requires trx->mutex (or exclusive global lock_sys latch), and +operating on a lock requires lock_sys latches, yet the latching order requires +lock_sys latches to be taken before trx->mutex. +One way around it is to use exclusive global lock_sys latch, which heavily +deteriorates concurrency. Another is to try to reacquire the latches in needed +order, veryfing that the list wasn't modified meanwhile. +This function performs following steps: +1. releases trx->mutex, +2. acquires proper lock_sys shard latch for given lock, +3. reaquires trx->mutex +4. executes f unless trx's locks list has changed +Before and after this function following should hold: +- the shared global lock_sys latch is held +- the trx->mutex is held +@param[in] lock the lock we are interested in +@param[in] f the function to execute when the shard is latched +@return true if f was called, false if it couldn't be called because trx locks + have changed while relatching trx->mutex +*/ +template +static bool try_relatch_trx_and_shard_and_do(const lock_t *lock, F &&f) { + if (lock_get_type_low(lock) == LOCK_REC) { + return try_relatch_trx_and_shard_and_do( + lock->trx, lock->rec_lock.get_page_id(), std::forward(f)); + } + + ut_ad(lock_get_type_low(lock) == LOCK_TABLE); + return try_relatch_trx_and_shard_and_do(lock->trx, *lock->tab_lock.table, + std::forward(f)); +} + +/** Tries to release read locks of a transaction without latching the whole +lock sys. This may fail, if there are many concurrent threads editing the +list of locks of this transaction (for example due to B-tree pages being +merged or split, or due to implicit-to-explicit conversion). +It is called during XA prepare to release locks early. +@param[in,out] trx transaction +@param[in] only_gap release only GAP locks +@return true if and only if it succeeded to do the job*/ +static bool try_release_read_locks_in_s_mode(trx_t *trx, bool only_gap) { + /* In order to access trx->lock.trx_locks safely we need to hold trx->mutex. + So, conceptually we'd love to hold trx->mutex while iterating through + trx->lock.trx_locks. + However the latching order only allows us to obtain trx->mutex AFTER any + lock_sys latch. + One way around this problem is to simply latch the whole lock_sys in exclusive + mode (which also prevents any changes to trx->lock.trx_locks), however this + impacts performance in appliers (TPS drops by up to 10%). + Here we use a different approach: + 1. we extract lock from the list when holding the trx->mutex, + 2. identify the shard of lock_sys it belongs to, + 3. store the current version of trx->lock.trx_locks + 4. release the trx->mutex, + 5. acquire the lock_sys shard's latch, + 6. and reacquire the trx->mutex, + 7. verify that the version of trx->lock.trx_locks has not changed + 8. and only then perform any action on the lock. 
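   Mapped onto the helper above, steps 3 to 8 look roughly like this (sketch
   only, with `shard` standing for either the page id or the table the lock
   belongs to, and `f` for the action to perform once it is safe):

     const auto version = trx->lock.trx_locks_version;  // step 3
     trx_mutex_exit(trx);                               // step 4
     Shard_naked_latch_guard guard{shard};              // step 5
     trx_mutex_enter_first_of_two(trx);                 // step 6
     if (version != trx->lock.trx_locks_version) {
       return false;                                    // step 7: caller restarts
     }
     f();                                               // step 8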
+ */ + ut_ad(trx_mutex_own(trx)); + ut_ad(locksys::owns_shared_global_latch()); + lock_t *lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); + + while (lock != nullptr) { + ut_ad(trx_mutex_own(trx)); + /* We didn't latch the lock_sys shard this `lock` is in, so we only read a + bare minimum set of information from the `lock`, such as the type, space, + page_no, and next pointer, which, as long as we hold trx->mutex, should be + immutable. + + Store the pointer to the next lock in the list, because in some cases we are + going to remove `lock` from the list, which clears the pointer to next lock + */ + auto next_lock = UT_LIST_GET_NEXT(trx_locks, lock); + if (lock_get_type_low(lock) == LOCK_REC) { + /* Following call temporarily releases trx->mutex */ + if (!try_relatch_trx_and_shard_and_do( + lock, [=]() { lock_release_read_lock(lock, only_gap); })) { + /* Someone has modified the list while we were re-acquiring the latches + so we need to start over again. */ + return false; + } + } + /* As we have verified that the version has not changed, it must be the case + that the next_lock is still the next lock as well */ + lock = next_lock; + } + return true; +} +} // namespace locksys + /** Release read locks of a transacion latching the whole lock-sys in -exclusive mode. +exclusive mode, which is a bit too expensive to do by default. It is called during XA prepare to release locks early. @param[in,out] trx transaction @param[in] only_gap release only GAP locks */ static void lock_trx_release_read_locks_in_x_mode(trx_t *trx, bool only_gap) { ut_ad(!trx_mutex_own(trx)); - lock_mutex_enter(); - trx_mutex_enter(trx); + /* We will iterate over locks from various shards. */ + locksys::Global_exclusive_latch_guard guard{}; + trx_mutex_enter_first_of_two(trx); lock_t *lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); while (lock != nullptr) { + DEBUG_SYNC_C("lock_trx_release_read_locks_in_x_mode_will_release"); /* Store the pointer to the next lock in the list, because in some cases we are going to remove `lock` from the list, which clears the pointer to next lock */ @@ -3964,26 +4148,32 @@ static void lock_trx_release_read_locks_in_x_mode(trx_t *trx, bool only_gap) { lock = next_lock; } - lock_mutex_exit(); trx_mutex_exit(trx); } void lock_trx_release_read_locks(trx_t *trx, bool only_gap) { - /* Avoid taking lock_sys if trx didn't acquire any lock. - We do not hold trx->mutex nor lock_sys latch while checking the emptiness of - trx->lock.trx_locks, but this is OK, because even if other threads are - modifying this list in parallel, they do not change the emptiness of it: - implicit-to-explicit conversion only occurs if the trx already has a table - intention lock, B-tree modification related operations always first create - a copy of old lock before removing old lock, and removal of wait lock can not - happen since we are not waiting. 
*/ ut_ad(trx_can_be_handled_by_current_thread(trx)); - ut_ad(trx->lock.wait_lock == nullptr); - if (UT_LIST_GET_LEN(trx->lock.trx_locks) == 0) { - return; + + size_t failures; + const size_t MAX_FAILURES = 5; + + { + locksys::Global_shared_latch_guard shared_latch_guard{}; + trx_mutex_enter(trx); + ut_ad(trx->lock.wait_lock == nullptr); + + for (failures = 0; failures < MAX_FAILURES; ++failures) { + if (locksys::try_release_read_locks_in_s_mode(trx, only_gap)) { + break; + } + } + + trx_mutex_exit(trx); } - lock_trx_release_read_locks_in_x_mode(trx, only_gap); + if (failures == MAX_FAILURES) { + lock_trx_release_read_locks_in_x_mode(trx, only_gap); + } } /** Releases transaction locks, and releases possible other transactions waiting @@ -3991,16 +4181,12 @@ void lock_trx_release_read_locks(trx_t *trx, bool only_gap) { @param[in,out] trx transaction */ static void lock_release(trx_t *trx) { lock_t *lock; - - ut_ad(!lock_mutex_own()); + ut_ad(!locksys::owns_exclusive_global_latch()); ut_ad(!trx_mutex_own(trx)); ut_ad(!trx->is_dd_trx); - /* Don't take lock_sys mutex if trx didn't acquire any lock. - We want to check if trx->lock.trx_lock is empty without holding trx->mutex - nor lock_sys->mutex. - In order to access trx->lock.trx_locks safely we should hold at least - trx->mutex. But: + locksys::Global_shared_latch_guard shared_latch_guard{}; + /* In order to access trx->lock.trx_locks safely we need to hold trx->mutex. The transaction is already in TRX_STATE_COMMITTED_IN_MEMORY state and is no longer referenced, so we are not afraid of implicit-to-explicit conversions, nor a cancellation of a wait_lock (we are running, not waiting). Still, there @@ -4008,29 +4194,37 @@ static void lock_release(trx_t *trx) { locks to be moved from one page to another, which at the low level means that a new lock is created (and added to trx->lock.trx_locks) and the old one is removed (also from trx->lock.trx_locks) in that specific order. - Actually, there is no situation in our code, where some other thread can - change the number of explicit locks from 0 to non-zero, or vice-versa. - Even the implicit-to-explicit conversion presumes that our trx holds at least - an explicit IX table lock (since it was allowed to modify the table). - Thus, if the only thing we want to do is comparing with zero, then there is - no real risk here. */ - if (UT_LIST_GET_LEN(trx->lock.trx_locks) == 0) { - return; - } - - lock_mutex_enter(); + So, conceptually we'd love to hold trx->mutex while iterating through + trx->lock.trx_locks. + However the latching order only allows us to obtain trx->mutex AFTER any + lock_sys latch. One way around this problem is to simply latch the whole + lock_sys in exclusive mode (which also prevents any changes to + trx->lock.trx_locks), however this impacts performance (TPS drops on + sysbench {pareto,uniform}-2S-{128,1024}-usrs tests by 3% to 11%) Here we + use a different approach: + 1. we extract lock from the list when holding the trx->mutex, + 2. identify the shard of lock_sys it belongs to, + 3. release the trx->mutex, + 4. acquire the lock_sys shard's latch, + 5. and reacquire the trx->mutex, + 6. verify that the lock pointer is still in trx->lock.trx_locks (so it is + safe to access it), + 7. and only then perform any action on the lock. 
+ */ trx_mutex_enter(trx); - for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks); lock != nullptr; - lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) { - if (lock_get_type_low(lock) == LOCK_REC) { - lock_rec_dequeue_from_page(lock); - } else { - lock_table_dequeue(lock); - } + ut_ad(trx->lock.wait_lock == nullptr); + while ((lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) != nullptr) { + /* Following call temporarily releases trx->mutex */ + locksys::try_relatch_trx_and_shard_and_do(lock, [=]() { + if (lock_get_type_low(lock) == LOCK_REC) { + lock_rec_dequeue_from_page(lock); + } else { + lock_table_dequeue(lock); + } + }); } - lock_mutex_exit(); trx_mutex_exit(trx); } @@ -4043,7 +4237,7 @@ static void lock_release(trx_t *trx) { static void lock_trx_table_locks_remove(const lock_t *lock_to_remove) { trx_t *trx = lock_to_remove->trx; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_table_shard(*lock_to_remove->tab_lock.table)); /* We will modify trx->lock.table_locks so we need trx->mutex */ ut_ad(trx_mutex_own(trx)); @@ -4075,7 +4269,7 @@ static void lock_remove_all_on_table_for_trx( /* This is used when we drop a table and indeed have exclusive lock_sys access. */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); /* We need trx->mutex to iterate over trx->lock.trx_lock and it is needed by lock_trx_table_locks_remove() and lock_table_remove_low() but we haven't acquired it yet. */ @@ -4112,10 +4306,9 @@ static ulint lock_remove_recovered_trx_record_locks( table itself */ { ut_a(table != nullptr); - /* This is used in recovery where indeed we hold an exclusive lock_sys latch, - which is needed as we are about to iterate over locks held by multiple - transactions while they might be operating. */ - ut_ad(lock_mutex_own()); + /* We need exclusive lock_sys latch, as we are about to iterate over locks + held by multiple transactions while they might be operating. */ + ut_ad(locksys::owns_exclusive_global_latch()); ulint n_recovered_trx = 0; @@ -4133,7 +4326,7 @@ static ulint lock_remove_recovered_trx_record_locks( acquired it yet. */ ut_ad(!trx_mutex_own(trx)); trx_mutex_enter(trx); - /* Because we are holding the lock_sys->mutex, + /* Because we are holding the exclusive global lock_sys latch, implicit locks cannot be converted to explicit ones while we are scanning the explicit locks. 
*/ @@ -4186,7 +4379,8 @@ void lock_remove_all_on_table( { lock_t *lock; - lock_mutex_enter(); + /* We will iterate over locks (including record locks) from various shards */ + locksys::Global_exclusive_latch_guard guard{}; for (lock = UT_LIST_GET_FIRST(table->locks); lock != nullptr; /* No op */) { @@ -4233,8 +4427,6 @@ void lock_remove_all_on_table( lock_remove_recovered_trx_record_locks(table) == 0) { lock_sys->rollback_complete = true; } - - lock_mutex_exit(); } /*===================== VALIDATION AND DEBUGGING ====================*/ @@ -4243,8 +4435,9 @@ void lock_remove_all_on_table( static void lock_table_print(FILE *file, /*!< in: file where to print */ const lock_t *lock) /*!< in: table type lock */ { - ut_ad(lock_mutex_own()); ut_a(lock_get_type_low(lock) == LOCK_TABLE); + /* We actually hold exclusive latch here, but we require just the shard */ + ut_ad(locksys::owns_table_shard(*lock->tab_lock.table)); fputs("TABLE LOCK table ", file); ut_print_name(file, lock->trx, lock->tab_lock.table->name.m_name); @@ -4280,13 +4473,11 @@ static void lock_rec_print(FILE *file, /*!< in: file where to print */ space_id_t space; page_no_t page_no; mtr_t mtr; - mem_heap_t *heap = nullptr; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - ulint *offsets = offsets_; - rec_offs_init(offsets_); + Rec_offsets offsets; - ut_ad(lock_mutex_own()); ut_a(lock_get_type_low(lock) == LOCK_REC); + /* We actually hold exclusive latch here, but we require just the shard */ + ut_ad(locksys::owns_page_shard(lock->rec_lock.get_page_id())); space = lock->rec_lock.space; page_no = lock->rec_lock.page_no; @@ -4343,21 +4534,14 @@ static void lock_rec_print(FILE *file, /*!< in: file where to print */ rec = page_find_rec_with_heap_no(buf_block_get_frame(block), i); - offsets = - rec_get_offsets(rec, lock->index, offsets, ULINT_UNDEFINED, &heap); - putc(' ', file); - rec_print_new(file, rec, offsets); + rec_print_new(file, rec, offsets.compute(rec, lock->index)); } putc('\n', file); } mtr_commit(&mtr); - - if (heap) { - mem_heap_free(heap); - } } #ifdef UNIV_DEBUG @@ -4375,7 +4559,7 @@ static ulint lock_get_n_rec_locks(void) { ulint i; /* We need exclusive access to lock_sys to iterate over all buckets */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) { const lock_t *lock; @@ -4392,25 +4576,8 @@ static ulint lock_get_n_rec_locks(void) { } #endif /* PRINT_NUM_OF_LOCK_STRUCTS */ -/** Prints info of locks for all transactions. - @return false if not able to obtain lock mutex - and exits without printing info */ -bool lock_print_info_summary( - FILE *file, /*!< in: file where to print */ - ibool nowait) /*!< in: whether to wait for the lock mutex */ -{ - /* if nowait is false, wait on the lock mutex, - otherwise return immediately if fail to obtain the - mutex. */ - if (!nowait) { - lock_mutex_enter(); - } else if (lock_mutex_enter_nowait()) { - fputs( - "FAIL TO OBTAIN LOCK MUTEX," - " SKIP LOCK INFO PRINTING\n", - file); - return (false); - } +void lock_print_info_summary(FILE *file) { + ut_ad(locksys::owns_exclusive_global_latch()); if (lock_deadlock_found) { fputs( @@ -4476,7 +4643,6 @@ bool lock_print_info_summary( fprintf(file, "Total number of lock structs in row lock hash table %lu\n", (ulong)lock_get_n_rec_locks()); #endif /* PRINT_NUM_OF_LOCK_STRUCTS */ - return (true); } /** Functor to print not-started transaction from the mysql_trx_list. 
*/ @@ -4486,7 +4652,7 @@ struct PrintNotStarted { void operator()(const trx_t *trx) { /* We require exclusive access to lock_sys */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx->in_mysql_trx_list); ut_ad(mutex_own(&trx_sys->mutex)); @@ -4515,9 +4681,10 @@ class TrxLockIterator { const lock_t *current(const trx_t *trx) const { lock_t *lock; ulint i = 0; - /* trx->lock.trx_locks is protected by trx->mutex and lock_sys mutex, and we - assume we have the exclusive latch on lock_sys here */ - ut_ad(lock_mutex_own()); + /* Writes to trx->lock.trx_locks are protected by trx->mutex combined with a + shared lock_sys global latch, and we assume we have the exclusive latch on + lock_sys here. */ + ut_ad(locksys::owns_exclusive_global_latch()); for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); lock != nullptr && i < m_index; lock = UT_LIST_GET_NEXT(trx_locks, lock), ++i) { @@ -4602,8 +4769,8 @@ class TrxListIterator { @param[in] trx transaction */ void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx) { /* We require exclusive lock_sys access so that trx->lock.wait_lock is - not being modified */ - ut_ad(lock_mutex_own()); + not being modified, and to access trx->lock.wait_started without trx->mutex.*/ + ut_ad(locksys::owns_exclusive_global_latch()); fprintf(file, "---"); trx_print_latched(file, trx, 600); @@ -4630,11 +4797,12 @@ void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx) { } } -/** Prints info of locks for a transaction. This function will release the - lock mutex and the trx_sys_t::mutex if the page was read from disk. - @return true if page was read from the tablespace */ -static bool lock_rec_fetch_page(const lock_t *lock) /*!< in: record lock */ -{ +/** Reads the page containing the record protected by the given lock. +This function will temporarily release the exclusive global latch and the +trx_sys_t::mutex if the page was read from disk. +@param[in] lock the record lock +@return true if a page was successfully read from the tablespace */ +static bool lock_rec_fetch_page(const lock_t *lock) { ut_ad(lock_get_type_low(lock) == LOCK_REC); space_id_t space_id = lock->rec_lock.space; @@ -4647,7 +4815,7 @@ static bool lock_rec_fetch_page(const lock_t *lock) /*!< in: record lock */ if (found) { mtr_t mtr; - lock_mutex_exit(); + locksys::Unsafe_global_latch_manipulator::exclusive_unlatch(); mutex_exit(&trx_sys->mutex); @@ -4665,7 +4833,7 @@ static bool lock_rec_fetch_page(const lock_t *lock) /*!< in: record lock */ fil_space_release(space); } - lock_mutex_enter(); + locksys::Unsafe_global_latch_manipulator::exclusive_latch(); mutex_enter(&trx_sys->mutex); @@ -4686,16 +4854,14 @@ static bool lock_trx_print_locks( { const lock_t *lock; /* We require exclusive access to lock_sys */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); /* Iterate over the transaction's locks. */ while ((lock = iter.current(trx)) != nullptr) { if (lock_get_type_low(lock) == LOCK_REC) { if (load_block) { - /* Note: lock_rec_fetch_page() will - release both the lock mutex and the - trx_sys_t::mutex if it does a read - from disk. */ + /* Note: lock_rec_fetch_page() will release both the exclusive global + latch and the trx_sys_t::mutex if it does a read from disk. */ if (lock_rec_fetch_page(lock)) { /* We need to resync the @@ -4741,14 +4907,9 @@ static bool lock_trx_print_locks( return (true); } -/** Prints info of locks for each transaction. 
This function assumes that the - caller holds the lock mutex and more importantly it will release the lock - mutex on behalf of the caller. (This should be fixed in the future). */ -void lock_print_info_all_transactions( - FILE *file) /*!< in/out: file where to print */ -{ +void lock_print_info_all_transactions(FILE *file) { /* We require exclusive access to lock_sys */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n"); @@ -4791,13 +4952,11 @@ void lock_print_info_all_transactions( TrxLockIterator &lock_iter = trx_iter.lock_iter(); if (!lock_trx_print_locks(file, trx, lock_iter, load_block)) { - /* Resync trx_iter, the trx_sys->mutex and - the lock mutex were released. A page was - successfully read in. We need to print its - contents on the next call to - lock_trx_print_locks(). On the next call to - lock_trx_print_locks() we should simply print - the contents of the page just read in.*/ + /* Resync trx_iter, the trx_sys->mutex and exclusive global latch were + temporarily released. A page was successfully read in. We need to print + its contents on the next call to lock_trx_print_locks(). On the next + call to lock_trx_print_locks() we should simply print the contents of + the page just read in.*/ load_block = false; continue; @@ -4811,10 +4970,7 @@ void lock_print_info_all_transactions( trx_iter.next(); } - lock_mutex_exit(); mutex_exit(&trx_sys->mutex); - - ut_ad(lock_validate()); } #ifdef UNIV_DEBUG @@ -4845,15 +5001,18 @@ static bool lock_table_queue_validate( { const lock_t *lock; - ut_ad(lock_mutex_own()); + /* We actually hold exclusive latch here, but we require just the shard */ + ut_ad(locksys::owns_table_shard(*table)); ut_ad(trx_sys_mutex_own()); for (lock = UT_LIST_GET_FIRST(table->locks); lock != nullptr; lock = UT_LIST_GET_NEXT(tab_lock.locks, lock)) { /* lock->trx->state cannot change from or to NOT_STARTED while we are holding the trx_sys->mutex. It may change - from ACTIVE to PREPARED, but it may not change to - COMMITTED, because we are holding the lock_sys->mutex. */ + from ACTIVE to PREPARED. It may become COMMITTED_IN_MEMORY even though we + hold trx_sys->mutex in case it has trx->id==0, but even in this case it + will not be freed until it can release the table lock, and we prevent + this by latching its shard. */ ut_ad(trx_assert_started(lock->trx)); if (!lock_get_wait(lock)) { @@ -4868,34 +5027,27 @@ static bool lock_table_queue_validate( return (true); } - +namespace locksys { /** Validates the lock queue on a single record. - @return true if ok */ -static bool lock_rec_queue_validate( - bool locked_lock_trx_sys, - /*!< in: if the caller holds - both the lock mutex and - trx_sys_t->lock. 
*/ - const buf_block_t *block, /*!< in: buffer block containing rec */ - const rec_t *rec, /*!< in: record to look at */ - const dict_index_t *index, /*!< in: index, or NULL if not known */ - const ulint *offsets) /*!< in: rec_get_offsets(rec, index) */ -{ +@param[in] block buffer block containing rec +@param[in] rec record to look at +@param[in] index index, or NULL if not known +@param[in] offsets rec_get_offsets(rec, index) */ +static void rec_queue_validate_latched(const buf_block_t *block, + const rec_t *rec, + const dict_index_t *index, + const ulint *offsets) { + ut_ad(owns_page_shard(block->get_page_id())); + ut_ad(mutex_own(&trx_sys->mutex)); ut_a(rec); ut_a(block->frame == page_align(rec)); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); - ut_ad(lock_mutex_own() == locked_lock_trx_sys); ut_ad(!index || index->is_clustered() || !dict_index_is_online_ddl(index)); ulint heap_no = page_rec_get_heap_no(rec); RecID rec_id{block, heap_no}; - if (!locked_lock_trx_sys) { - lock_mutex_enter(); - mutex_enter(&trx_sys->mutex); - } - if (!page_rec_is_user_rec(rec)) { Lock_iter::for_each(rec_id, [&](lock_t *lock) { ut_ad(!trx_is_ac_nl_ro(lock->trx)); @@ -4911,12 +5063,7 @@ static bool lock_rec_queue_validate( return (true); }); - if (!locked_lock_trx_sys) { - lock_mutex_exit(); - mutex_exit(&trx_sys->mutex); - } - - return (true); + return; } if (index == nullptr) { @@ -4926,13 +5073,13 @@ static bool lock_rec_queue_validate( trx_id_t trx_id; /* Unlike the non-debug code, this invariant can only succeed - if the check and assertion are covered by the lock mutex. */ + if the check and assertion are covered by the lock_sys latch. */ trx_id = lock_clust_rec_some_has_impl(rec, index, offsets); const trx_t *impl_trx = trx_rw_is_active_low(trx_id, nullptr); if (impl_trx != nullptr) { - ut_ad(lock_mutex_own()); + ut_ad(owns_page_shard(block->get_page_id())); ut_ad(trx_sys_mutex_own()); /* impl_trx cannot become TRX_STATE_COMMITTED_IN_MEMORY nor removed from rw_trx_set until we release trx_sys->mutex, which means that currently all @@ -4983,15 +5130,37 @@ static bool lock_rec_queue_validate( return (true); }); +} - if (!locked_lock_trx_sys) { - lock_mutex_exit(); - - mutex_exit(&trx_sys->mutex); - } +/** Validates the lock queue on a single record. +@param[in] block buffer block containing rec +@param[in] rec record to look at +@param[in] index index, or NULL if not known +@param[in] offsets rec_get_offsets(rec, index) */ +static void rec_queue_latch_and_validate(const buf_block_t *block, + const rec_t *rec, + const dict_index_t *index, + const ulint *offsets) { + ut_ad(!owns_exclusive_global_latch()); + ut_ad(!mutex_own(&trx_sys->mutex)); + + Shard_latch_guard guard{block->get_page_id()}; + mutex_enter(&trx_sys->mutex); + rec_queue_validate_latched(block, rec, index, offsets); + mutex_exit(&trx_sys->mutex); +} - return (true); +/** Validates the lock queue on a single record. +@param[in] block buffer block containing rec +@param[in] rec record to look at +@param[in] index index, or NULL if not known */ +static void rec_queue_latch_and_validate(const buf_block_t *block, + const rec_t *rec, + const dict_index_t *index) { + rec_queue_latch_and_validate(block, rec, index, + Rec_offsets().compute(rec, index)); } +} // namespace locksys /** Validates the record lock queues on a page. 
@return true if ok */ @@ -5003,14 +5172,11 @@ static bool lock_rec_validate_page( ulint nth_lock = 0; ulint nth_bit = 0; ulint i; - mem_heap_t *heap = nullptr; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - ulint *offsets = offsets_; - rec_offs_init(offsets_); + Rec_offsets offsets; - ut_ad(!lock_mutex_own()); + ut_ad(!locksys::owns_exclusive_global_latch()); - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; mutex_enter(&trx_sys->mutex); loop: lock = lock_rec_get_first_on_page_addr( @@ -5037,15 +5203,14 @@ static bool lock_rec_validate_page( if (i == 1 || lock_rec_get_nth_bit(lock, i)) { rec = page_find_rec_with_heap_no(block->frame, i); ut_a(rec); - offsets = - rec_get_offsets(rec, lock->index, offsets, ULINT_UNDEFINED, &heap); /* If this thread is holding the file space latch (fil_space_t::latch), the following check WILL break the latching order and may cause a deadlock of threads. */ - lock_rec_queue_validate(true, block, rec, lock->index, offsets); + locksys::rec_queue_validate_latched(block, rec, lock->index, + offsets.compute(rec, lock->index)); nth_bit = i + 1; @@ -5059,12 +5224,8 @@ static bool lock_rec_validate_page( goto loop; function_exit: - lock_mutex_exit(); mutex_exit(&trx_sys->mutex); - if (heap != nullptr) { - mem_heap_free(heap); - } return (true); } @@ -5076,7 +5237,7 @@ static bool lock_validate_table_locks( const trx_t *trx; /* We need exclusive access to lock_sys to iterate over trxs' locks */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx_sys_mutex_own()); ut_ad(trx_list == &trx_sys->rw_trx_list); @@ -5106,7 +5267,9 @@ static MY_ATTRIBUTE((warn_unused_result)) const lock_t *lock_rec_validate( uint64_t *limit) /*!< in/out: upper limit of (space, page_no) */ { - ut_ad(lock_mutex_own()); + /* Actually we only require to latch the start-th shard, but we happen to + hold exclusive latch here, which is easier to assert */ + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx_sys_mutex_own()); for (const lock_t *lock = static_cast( @@ -5157,41 +5320,41 @@ static void lock_rec_block_validate(space_id_t space_id, page_no_t page_no) { } } -/** Validates the lock system. - @return true if ok */ -static bool lock_validate() { +bool lock_validate() { typedef std::pair page_addr_t; typedef std::set, ut_allocator> page_addr_set; page_addr_set pages; + { + /* lock_validate_table_locks() needs exclusive global latch, and we will + inspect record locks from all shards */ + locksys::Global_exclusive_latch_guard guard{}; + mutex_enter(&trx_sys->mutex); - lock_mutex_enter(); - mutex_enter(&trx_sys->mutex); - - ut_a(lock_validate_table_locks(&trx_sys->rw_trx_list)); + ut_a(lock_validate_table_locks(&trx_sys->rw_trx_list)); - /* Iterate over all the record locks and validate the locks. We - don't want to hog the lock_sys_t::mutex and the trx_sys_t::mutex. - Release both mutexes during the validation check. */ + /* Iterate over all the record locks and validate the locks. We + don't want to hog the lock_sys global latch and the trx_sys_t::mutex. + Thus we release both latches before the validation check. 
*/ - for (ulint i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) { - const lock_t *lock; - uint64_t limit = 0; + for (ulint i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) { + const lock_t *lock; + uint64_t limit = 0; - while ((lock = lock_rec_validate(i, &limit)) != nullptr) { - page_no_t page_no; - space_id_t space = lock->rec_lock.space; + while ((lock = lock_rec_validate(i, &limit)) != nullptr) { + page_no_t page_no; + space_id_t space = lock->rec_lock.space; - page_no = lock->rec_lock.page_no; + page_no = lock->rec_lock.page_no; - pages.insert(std::make_pair(space, page_no)); + pages.insert(std::make_pair(space, page_no)); + } } - } - mutex_exit(&trx_sys->mutex); - lock_mutex_exit(); + mutex_exit(&trx_sys->mutex); + } for (page_addr_set::const_iterator it = pages.begin(); it != pages.end(); ++it) { @@ -5232,78 +5395,58 @@ dberr_t lock_rec_insert_check_and_lock( ut_ad(!index->table->is_temporary()); - dberr_t err; + dberr_t err = DB_SUCCESS; lock_t *lock; ibool inherit_in = *inherit; trx_t *trx = thr_get_trx(thr); const rec_t *next_rec = page_rec_get_next_const(rec); ulint heap_no = page_rec_get_heap_no(next_rec); - lock_mutex_enter(); - /* Because this code is invoked for a running transaction by - the thread that is serving the transaction, it is not necessary - to hold trx->mutex here. */ - - /* When inserting a record into an index, the table must be at - least IX-locked. When we are building an index, we would pass - BTR_NO_LOCKING_FLAG and skip the locking altogether. */ - ut_ad(lock_table_has(trx, index->table, LOCK_IX)); - - lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); - - if (lock == nullptr) { - /* We optimize CPU time usage in the simplest case */ - - lock_mutex_exit(); - - if (inherit_in && !index->is_clustered()) { - /* Update the page max trx id field */ - page_update_max_trx_id(block, buf_block_get_page_zip(block), trx->id, - mtr); - } - - *inherit = false; - - return (DB_SUCCESS); - } + { + locksys::Shard_latch_guard guard{block->get_page_id()}; - /* Spatial index does not use GAP lock protection. It uses - "predicate lock" to protect the "range" */ - if (dict_index_is_spatial(index)) { - return (DB_SUCCESS); - } + /* When inserting a record into an index, the table must be at + least IX-locked. When we are building an index, we would pass + BTR_NO_LOCKING_FLAG and skip the locking altogether. */ + ut_ad(lock_table_has(trx, index->table, LOCK_IX)); - *inherit = true; + /* Spatial index does not use GAP lock protection. It uses + "predicate lock" to protect the "range" */ + ut_ad(!dict_index_is_spatial(index)); - /* If another transaction has an explicit lock request which locks - the gap, waiting or granted, on the successor, the insert has to wait. + lock = lock_rec_get_first(lock_sys->rec_hash, block, heap_no); - An exception is the case where the lock by the another transaction - is a gap type lock which it placed to wait for its turn to insert. We - do not consider that kind of a lock conflicting with our insert. This - eliminates an unnecessary deadlock which resulted when 2 transactions - had to wait for their insert. Both had waiting gap type lock requests - on the successor, which produced an unnecessary deadlock. */ + if (lock == nullptr) { + *inherit = false; + } else { + *inherit = true; - const ulint type_mode = LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION; + /* If another transaction has an explicit lock request which locks + the gap, waiting or granted, on the successor, the insert has to wait. 
- const lock_t *wait_for = - lock_rec_other_has_conflicting(type_mode, block, heap_no, trx); + An exception is the case where the lock by the another transaction + is a gap type lock which it placed to wait for its turn to insert. We + do not consider that kind of a lock conflicting with our insert. This + eliminates an unnecessary deadlock which resulted when 2 transactions + had to wait for their insert. Both had waiting gap type lock requests + on the successor, which produced an unnecessary deadlock. */ - if (wait_for != nullptr) { - RecLock rec_lock(thr, index, block, heap_no, type_mode); + const ulint type_mode = LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION; - trx_mutex_enter(trx); + const lock_t *wait_for = + lock_rec_other_has_conflicting(type_mode, block, heap_no, trx); - err = rec_lock.add_to_waitq(wait_for); + if (wait_for != nullptr) { + RecLock rec_lock(thr, index, block, heap_no, type_mode); - trx_mutex_exit(trx); + trx_mutex_enter(trx); - } else { - err = DB_SUCCESS; - } + err = rec_lock.add_to_waitq(wait_for); - lock_mutex_exit(); + trx_mutex_exit(trx); + } + } + } /* Shard_latch_guard */ switch (err) { case DB_SUCCESS_LOCKED_REC: @@ -5322,24 +5465,8 @@ dberr_t lock_rec_insert_check_and_lock( break; } -#ifdef UNIV_DEBUG - { - mem_heap_t *heap = nullptr; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - const ulint *offsets; - rec_offs_init(offsets_); - - offsets = - rec_get_offsets(next_rec, index, offsets_, ULINT_UNDEFINED, &heap); - - ut_ad(lock_rec_queue_validate(false, block, next_rec, index, offsets)); - - if (heap != nullptr) { - mem_heap_free(heap); - } - } + ut_d(locksys::rec_queue_latch_and_validate(block, next_rec, index)); ut_ad(err == DB_SUCCESS || err == DB_LOCK_WAIT || err == DB_DEADLOCK); -#endif /* UNIV_DEBUG */ return (err); } @@ -5359,55 +5486,45 @@ static void lock_rec_convert_impl_to_expl_for_trx( ut_ad(trx_is_referenced(trx)); DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx"); + { + locksys::Shard_latch_guard guard{block->get_page_id()}; + /* This trx->mutex acquisition here is not really needed. + Its purpose is to prevent a state transition between calls to trx_state_eq() + and lock_rec_add_to_queue(). + But one can prove, that even if the state did change, it is not + a big problem, because we still keep reference count from dropping + to zero, so the trx object is still in use, and we hold the shard latched, + so trx can not release its explicit lock (if it has any) so we will + notice the explicit lock in lock_rec_has_expl. + On the other hand if trx does not have explicit lock, then we would create + one on its behalf, which is wasteful, but does not cause a problem, as once + the reference count drops to zero the trx will notice and remove this new + explicit lock. Also, even if some other trx had observed that trx is already + removed from rw trxs list and thus ignored the implicit lock and decided to + add its own lock, it will still have to wait for shard latch before adding + her lock. However it does not cost us much to simply take the trx->mutex + and avoid this whole shaky reasoning. */ + trx_mutex_enter(trx); - lock_mutex_enter(); - /* This trx->mutex acquisition here is not really needed. - Its purpose is to prevent a state transition between calls to trx_state_eq() - and lock_rec_add_to_queue(). 
- But one can prove, that even if the state did change, it is not - a big problem, because we still keep reference count from dropping - to zero, so the trx object is still in use, and we hold the lock mutex - so trx can not release its explicit lock (if it has any) so we will - notice the explicit lock in lock_rec_has_expl. - On the other hand if trx does not have explicit lock, then we would create one - on its behalf, which is wasteful, but does not cause a problem, as once the - reference count drops to zero the trx will notice and remove this new explicit - lock. - Also, even if some other trx had observed that trx is already removed from - rw trxs list and thus ignored the implicit lock and decided to add its own - lock, it will still have to wait for lock_mutex before adding her lock. - However it does not cost us much to simply take the trx->mutex - and avoid this whole shaky reasoning. */ - trx_mutex_enter(trx); + ut_ad(!index->is_clustered() || + trx->id == + lock_clust_rec_some_has_impl( + rec, index, + offsets ? offsets : Rec_offsets().compute(rec, index))); -#ifdef UNIV_DEBUG - if (index->is_clustered()) { - mem_heap_t *heap = nullptr; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - if (!offsets) { - rec_offs_init(offsets_); - offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap); - } - auto implicit_owner_id = lock_clust_rec_some_has_impl(rec, index, offsets); - ut_a(implicit_owner_id == trx->id); - if (heap != nullptr) { - mem_heap_free(heap); - } - } -#endif - ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED)); - if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) && - !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no, trx)) { - ulint type_mode; + if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) && + !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no, trx)) { + ulint type_mode; - type_mode = (LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP); + type_mode = (LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP); - lock_rec_add_to_queue(type_mode, block, heap_no, index, trx, true); - } + lock_rec_add_to_queue(type_mode, block, heap_no, index, trx, true); + } - lock_mutex_exit(); - trx_mutex_exit(trx); + trx_mutex_exit(trx); + } trx_release_reference(trx); @@ -5425,7 +5542,7 @@ static void lock_rec_convert_impl_to_expl(const buf_block_t *block, const ulint *offsets) { trx_t *trx; - ut_ad(!lock_mutex_own()); + ut_ad(!locksys::owns_exclusive_global_latch()); ut_ad(page_rec_is_user_rec(rec)); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); @@ -5509,18 +5626,17 @@ dberr_t lock_clust_rec_modify_check_and_lock( lock_rec_convert_impl_to_expl(block, rec, index, offsets); - lock_mutex_enter(); - - ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - - err = lock_rec_lock(true, SELECT_ORDINARY, LOCK_X | LOCK_REC_NOT_GAP, block, - heap_no, index, thr); + { + locksys::Shard_latch_guard guard{block->get_page_id()}; + ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + err = lock_rec_lock(true, SELECT_ORDINARY, LOCK_X | LOCK_REC_NOT_GAP, block, + heap_no, index, thr); - lock_mutex_exit(); + MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + } - ut_ad(lock_rec_queue_validate(false, block, rec, index, offsets)); + ut_d(locksys::rec_queue_latch_and_validate(block, rec, index, offsets)); if (err == DB_SUCCESS_LOCKED_REC) { err = DB_SUCCESS; @@ -5564,34 +5680,18 @@ dberr_t lock_sec_rec_modify_check_and_lock( because when we 
come here, we already have modified the clustered index record, and this would not have been possible if another active transaction had modified this secondary index record. */ - - lock_mutex_enter(); - - ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - - err = lock_rec_lock(true, SELECT_ORDINARY, LOCK_X | LOCK_REC_NOT_GAP, block, - heap_no, index, thr); - - MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); - - lock_mutex_exit(); - -#ifdef UNIV_DEBUG { - mem_heap_t *heap = nullptr; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - const ulint *offsets; - rec_offs_init(offsets_); + locksys::Shard_latch_guard guard{block->get_page_id()}; - offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap); + ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - ut_ad(lock_rec_queue_validate(false, block, rec, index, offsets)); + err = lock_rec_lock(true, SELECT_ORDINARY, LOCK_X | LOCK_REC_NOT_GAP, block, + heap_no, index, thr); - if (heap != nullptr) { - mem_heap_free(heap); - } + MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); } -#endif /* UNIV_DEBUG */ + + ut_d(locksys::rec_queue_latch_and_validate(block, rec, index)); if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) { /* Update the page max trx id field */ @@ -5635,27 +5735,26 @@ dberr_t lock_sec_rec_read_check_and_lock( !page_rec_is_supremum(rec)) { lock_rec_convert_impl_to_expl(block, rec, index, offsets); } + { + locksys::Shard_latch_guard guard{block->get_page_id()}; - lock_mutex_enter(); - - if (duration == lock_duration_t::AT_LEAST_STATEMENT) { - lock_protect_locks_till_statement_end(thr); - } - - ut_ad(mode != LOCK_X || - lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - ut_ad(mode != LOCK_S || - lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + if (duration == lock_duration_t::AT_LEAST_STATEMENT) { + lock_protect_locks_till_statement_end(thr); + } - err = lock_rec_lock(false, sel_mode, mode | gap_mode, block, heap_no, index, - thr); + ut_ad(mode != LOCK_X || + lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad(mode != LOCK_S || + lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); - MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + err = lock_rec_lock(false, sel_mode, mode | gap_mode, block, heap_no, index, + thr); - lock_mutex_exit(); + MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + } DEBUG_SYNC_C("lock_sec_rec_read_check_and_lock_has_locked"); - ut_ad(lock_rec_queue_validate(false, block, rec, index, offsets)); + ut_d(locksys::rec_queue_latch_and_validate(block, rec, index, offsets)); ut_ad(err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC || err == DB_LOCK_WAIT || err == DB_DEADLOCK || err == DB_SKIP_LOCKED || err == DB_LOCK_NOWAIT); @@ -5687,25 +5786,25 @@ dberr_t lock_clust_rec_read_check_and_lock( } DEBUG_SYNC_C("after_lock_clust_rec_read_check_and_lock_impl_to_expl"); - lock_mutex_enter(); - - if (duration == lock_duration_t::AT_LEAST_STATEMENT) { - lock_protect_locks_till_statement_end(thr); - } + { + locksys::Shard_latch_guard guard{block->get_page_id()}; - ut_ad(mode != LOCK_X || - lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - ut_ad(mode != LOCK_S || - lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + if (duration == lock_duration_t::AT_LEAST_STATEMENT) { + lock_protect_locks_till_statement_end(thr); + } - err = lock_rec_lock(false, sel_mode, mode | gap_mode, block, heap_no, index, - thr); + ut_ad(mode != LOCK_X || + lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad(mode != LOCK_S || + lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); - 
MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + err = lock_rec_lock(false, sel_mode, mode | gap_mode, block, heap_no, index, + thr); - lock_mutex_exit(); + MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + } - ut_ad(lock_rec_queue_validate(false, block, rec, index, offsets)); + ut_d(locksys::rec_queue_latch_and_validate(block, rec, index, offsets)); DEBUG_SYNC_C("after_lock_clust_rec_read_check_and_lock"); ut_ad(err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC || @@ -5738,19 +5837,9 @@ dberr_t lock_clust_rec_read_check_and_lock_alt( LOCK_REC_NOT_GAP */ que_thr_t *thr) /*!< in: query thread */ { - mem_heap_t *tmp_heap = nullptr; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - ulint *offsets = offsets_; - dberr_t err; - rec_offs_init(offsets_); - - offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &tmp_heap); - err = lock_clust_rec_read_check_and_lock(lock_duration_t::REGULAR, block, rec, - index, offsets, SELECT_ORDINARY, - mode, gap_mode, thr); - if (tmp_heap) { - mem_heap_free(tmp_heap); - } + dberr_t err = lock_clust_rec_read_check_and_lock( + lock_duration_t::REGULAR, block, rec, index, + Rec_offsets().compute(rec, index), SELECT_ORDINARY, mode, gap_mode, thr); if (err == DB_SUCCESS_LOCKED_REC) { err = DB_SUCCESS; @@ -5770,7 +5859,10 @@ void lock_release_autoinc_last_lock(trx_t *trx) { ut_ad(trx_mutex_own(trx)); ib_vector_t *autoinc_locks = trx->lock.autoinc_locks; - ut_ad(lock_mutex_own()); + /* Since we do not know for which table the trx has created the last lock + we can not narrow the required latch to any particular shard, and thus we + require exclusive access to lock_sys here */ + ut_ad(locksys::owns_exclusive_global_latch()); ut_a(!ib_vector_is_empty(autoinc_locks)); /* The lock to be release must be the last lock acquired. */ @@ -5805,7 +5897,10 @@ static bool lock_trx_holds_autoinc_locks( /** Release all the transaction's autoinc locks. 
*/ static void lock_release_autoinc_locks(trx_t *trx) /*!< in/out: transaction */ { - ut_ad(lock_mutex_own()); + /* Since we do not know for which table(s) the trx has created the lock(s) + we can not narrow the required latch to any particular shard, and thus we + require exclusive access to lock_sys here */ + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx_mutex_own(trx)); ut_a(trx->lock.autoinc_locks != nullptr); @@ -5865,9 +5960,10 @@ void lock_get_psi_event(const lock_t *lock, ulonglong *thread_id, @return The first lock */ const lock_t *lock_get_first_trx_locks(const trx_lock_t *trx_lock) { - /* trx->lock.trx_locks is protected by trx->mutex and lock_sys mutex, and we - assume we have the exclusive latch on lock_sys here */ - ut_ad(lock_mutex_own()); + /* Writes to trx->lock.trx_locks are protected by trx->mutex combined with a + shared global lock_sys latch, and we assume we have the exclusive latch on + lock_sys here */ + ut_ad(locksys::owns_exclusive_global_latch()); const lock_t *result = UT_LIST_GET_FIRST(trx_lock->trx_locks); return (result); } @@ -5877,9 +5973,10 @@ const lock_t *lock_get_first_trx_locks(const trx_lock_t *trx_lock) { @return The next lock */ const lock_t *lock_get_next_trx_locks(const lock_t *lock) { - /* trx->lock.trx_locks is protected by trx->mutex and lock_sys mutex, and we - assume we have the exclusive latch on lock_sys here */ - ut_ad(lock_mutex_own()); + /* Writes to trx->lock.trx_locks are protected by trx->mutex combined with a + shared global lock_sys latch, and we assume we have the exclusive latch on + lock_sys here */ + ut_ad(locksys::owns_exclusive_global_latch()); const lock_t *result = UT_LIST_GET_NEXT(trx_locks, lock); return (result); } @@ -5899,10 +5996,9 @@ const lock_t *lock_get_next_trx_locks(const lock_t *lock) { @return lock mode */ const char *lock_get_mode_str(const lock_t *lock) /*!< in: lock */ { - /* We might need to modify lock_cached_lock_mode_names, so we need exclusive - access. Thankfully lock_get_mode_str is used only while holding the - lock_sys->mutex so we don't need dedicated mutex */ - ut_ad(lock_mutex_own()); + /* We use exclusive global lock_sys latch to protect the global + lock_cached_lock_mode_names mapping. */ + ut_ad(locksys::owns_exclusive_global_latch()); const auto type_mode = lock->type_mode; const auto mode = lock->mode(); @@ -6044,7 +6140,24 @@ page_no_t lock_rec_get_page_no(const lock_t *lock) /*!< in: lock */ waiting behind it. @param[in,out] lock Waiting lock request */ void lock_cancel_waiting_and_release(lock_t *lock) { - ut_ad(lock_mutex_own()); + /* Requiring exclusive global latch serves several purposes here. + + 1. In case of a LOCK_TABLE lock we will call lock_release_autoinc_locks(), + which iterates over locks held by this transaction and it is not clear if + these locks are from the same table. Frankly it is not clear why we even + release all of them here (note that none of them is our `lock` because we + don't store waiting locks in the trx->autoinc_locks vector, only granted). + Perhaps this is because this trx is going to be rolled back anyway, and this + seemed to be a good moment to release them? + + 2. During lock_rec_dequeue_from_page() and lock_table_dequeue() we might latch + trx mutex of another transaction to grant it a lock. The rules meant to avoid + deadlocks between trx mutexes require us to either use an exclusive global + latch, or to first latch the trx which has trx->lock.wait_lock == nullptr. 
+ As `lock == lock->trx->lock.wait_lock` and thus is not nullptr, we have to use + the first approach, or complicate the proof of deadlock avoidance enormously. + */ + ut_ad(locksys::owns_exclusive_global_latch()); /* We will access lock->trx->lock.autoinc_locks which requires trx->mutex */ ut_ad(trx_mutex_own(lock->trx)); @@ -6054,7 +6167,6 @@ void lock_cancel_waiting_and_release(lock_t *lock) { ut_ad(lock_get_type_low(lock) & LOCK_TABLE); if (lock->trx->lock.autoinc_locks != nullptr) { - /* Release the transaction's AUTOINC locks. */ lock_release_autoinc_locks(lock->trx); } @@ -6069,9 +6181,8 @@ void lock_cancel_waiting_and_release(lock_t *lock) { connection thread that owns the transaction (trx->mysql_thd). */ void lock_unlock_table_autoinc(trx_t *trx) /*!< in/out: transaction */ { - ut_ad(!lock_mutex_own()); + ut_ad(!locksys::owns_exclusive_global_latch()); ut_ad(!trx_mutex_own(trx)); - ut_ad(!trx->lock.wait_lock); /* This can be invoked on NOT_STARTED, ACTIVE, PREPARED, but not COMMITTED transactions. */ @@ -6104,14 +6215,17 @@ void lock_unlock_table_autoinc(trx_t *trx) /*!< in/out: transaction */ 2. trx->mutex is cheap */ trx_mutex_enter(trx); + ut_ad(!trx->lock.wait_lock); bool might_have_autoinc_locks = lock_trx_holds_autoinc_locks(trx); trx_mutex_exit(trx); if (might_have_autoinc_locks) { - lock_mutex_enter(); + /* lock_release_autoinc_locks() requires exclusive global latch as the + AUTOINC locks might be on tables from different shards. Identifying and + latching them in correct order would complicate this rarely-taken path. */ + locksys::Global_exclusive_latch_guard guard{}; trx_mutex_enter(trx); lock_release_autoinc_locks(trx); - lock_mutex_exit(); trx_mutex_exit(trx); } } @@ -6195,7 +6309,10 @@ dberr_t lock_trx_handle_wait(trx_t *trx) /*!< in/out: trx lock state */ { dberr_t err; - lock_mutex_enter(); + /* lock_cancel_waiting_and_release() requires exclusive global latch, and so + does reading the trx->lock.wait_lock to prevent races with B-tree page + reorganization */ + locksys::Global_exclusive_latch_guard guard{}; trx_mutex_enter(trx); @@ -6209,7 +6326,6 @@ dberr_t lock_trx_handle_wait(trx_t *trx) /*!< in/out: trx lock state */ err = DB_SUCCESS; } - lock_mutex_exit(); trx_mutex_exit(trx); return (err); @@ -6228,7 +6344,11 @@ static const lock_t *lock_table_locks_lookup( const trx_t *trx; ut_a(table != nullptr); - ut_ad(lock_mutex_own()); + /* We are going to iterate over multiple transactions, so even though we know + which table we are looking for we can not narrow required latch to just the + shard which contains the table, because accessing trx->lock.trx_locks would be + unsafe */ + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx_sys_mutex_own()); for (trx = UT_LIST_GET_FIRST(*trx_list); trx != nullptr; @@ -6257,18 +6377,18 @@ static const lock_t *lock_table_locks_lookup( } #endif /* UNIV_DEBUG */ -/** Check if there are any locks (table or rec) against table. - @return true if table has either table or record locks. */ -bool lock_table_has_locks( - const dict_table_t *table) /*!< in: check if there are any locks - held on records in this table or on the - table itself */ -{ - ibool has_locks; +bool lock_table_has_locks(const dict_table_t *table) { + /** The n_rec_locks field might be modified by operation on any page shard, + so we need to latch everything. Note, that the results of this function will + be obsolete, as soon as we release the latch. It is called in contexts where + we believe that the number of locks should either be zero or decreasing. 
For + such scenario of usage, we might perhaps read the n_rec_locks without latch + and restrict latch just to a table shard. But that would complicate the debug + version of the code for no significant gain as this is not a hot path. */ + locksys::Global_exclusive_latch_guard guard{}; - lock_mutex_enter(); - - has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0; + bool has_locks = + UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks.load() > 0; #ifdef UNIV_DEBUG if (!has_locks) { @@ -6280,8 +6400,6 @@ bool lock_table_has_locks( } #endif /* UNIV_DEBUG */ - lock_mutex_exit(); - return (has_locks); } @@ -6309,11 +6427,10 @@ bool lock_trx_has_rec_x_lock(que_thr_t *thr, const dict_table_t *table, ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM); const trx_t *trx = thr_get_trx(thr); - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; ut_a(lock_table_has(trx, table, LOCK_IX) || table->is_temporary()); ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no, trx) || table->is_temporary()); - lock_mutex_exit(); return (true); } #endif /* UNIV_DEBUG */ @@ -6324,7 +6441,7 @@ is enabled. */ void Deadlock_notifier::start_print() { /* I/O operations on lock_latest_err_file require exclusive latch on lock_sys */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); rewind(lock_latest_err_file); ut_print_timestamp(lock_latest_err_file); @@ -6340,7 +6457,7 @@ void Deadlock_notifier::start_print() { void Deadlock_notifier::print(const char *msg) { /* I/O operations on lock_latest_err_file require exclusive latch on lock_sys */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); fputs(msg, lock_latest_err_file); if (srv_print_all_deadlocks) { @@ -6357,7 +6474,7 @@ void Deadlock_notifier::print(const trx_t *trx, ulint max_query_len) { 2. lock_number_of_rows_locked() 3. Accessing trx->lock fields requires either holding trx->mutex or latching the lock sys. */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); trx_mutex_enter(trx); ulint n_rec_locks = lock_number_of_rows_locked(&trx->lock); @@ -6383,7 +6500,7 @@ void Deadlock_notifier::print(const trx_t *trx, ulint max_query_len) { void Deadlock_notifier::print(const lock_t *lock) { /* I/O operations on lock_latest_err_file require exclusive latch on lock_sys. 
*/ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); if (lock_get_type_low(lock) == LOCK_REC) { lock_rec_print(lock_latest_err_file, lock); @@ -6403,7 +6520,7 @@ void Deadlock_notifier::print(const lock_t *lock) { void Deadlock_notifier::print_title(size_t pos_on_cycle, const char *title) { /* I/O operations on lock_latest_err_file require exclusive latch on lock_sys */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); ut::ostringstream buff; buff << "\n*** (" << (pos_on_cycle + 1) << ") " << title << ":\n"; print(buff.str().c_str()); @@ -6411,7 +6528,7 @@ void Deadlock_notifier::print_title(size_t pos_on_cycle, const char *title) { void Deadlock_notifier::notify(const ut::vector &trxs_on_cycle, const trx_t *victim_trx) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); start_print(); const auto n = trxs_on_cycle.size(); diff --git a/storage/innobase/lock/lock0prdt.cc b/storage/innobase/lock/lock0prdt.cc index 684bfead7870..9d0700c1cbf3 100644 --- a/storage/innobase/lock/lock0prdt.cc +++ b/storage/innobase/lock/lock0prdt.cc @@ -223,7 +223,7 @@ lock_t *lock_prdt_has_lock(ulint precise_mode, /*!< in: LOCK_S or LOCK_X */ { lock_t *lock; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S || (precise_mode & LOCK_MODE_MASK) == LOCK_X); ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); @@ -271,7 +271,7 @@ static const lock_t *lock_prdt_other_has_conflicting( the new lock will be on */ const trx_t *trx) /*!< in: our transaction */ { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); for (const lock_t *lock = lock_rec_get_first(lock_hash_get(mode), block, PRDT_HEAPNO); @@ -350,7 +350,7 @@ static lock_t *lock_prdt_find_on_page( { lock_t *lock; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); for (lock = lock_rec_get_first_on_page(lock_hash_get(type_mode), block); lock != nullptr; lock = lock_rec_get_next_on_page(lock)) { @@ -384,7 +384,7 @@ static lock_t *lock_prdt_add_to_queue( lock_prdt_t *prdt) /*!< in: Minimum Bounding Rectangle the new lock will be on */ { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); ut_ad(!index->is_clustered() && !dict_index_is_online_ddl(index)); ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)); ut_ad(!trx_mutex_own(trx)); @@ -451,65 +451,54 @@ dberr_t lock_prdt_insert_check_and_lock( trx_t *trx = thr_get_trx(thr); - lock_mutex_enter(); - - /* Because this code is invoked for a running transaction by - the thread that is serving the transaction, it is not necessary - to hold trx->mutex here. */ - - ut_ad(lock_table_has(trx, index->table, LOCK_IX)); - - lock_t *lock; - - /* Only need to check locks on prdt_hash */ - lock = lock_rec_get_first(lock_sys->prdt_hash, block, PRDT_HEAPNO); + dberr_t err = DB_SUCCESS; + { + locksys::Shard_latch_guard guard{block->get_page_id()}; - if (lock == nullptr) { - lock_mutex_exit(); + /* Because this code is invoked for a running transaction by + the thread that is serving the transaction, it is not necessary + to hold trx->mutex here. 
*/ - /* Update the page max trx id field */ - page_update_max_trx_id(block, buf_block_get_page_zip(block), trx->id, mtr); + ut_ad(lock_table_has(trx, index->table, LOCK_IX)); - return (DB_SUCCESS); - } + lock_t *lock; - ut_ad(lock->type_mode & LOCK_PREDICATE); + /* Only need to check locks on prdt_hash */ + lock = lock_rec_get_first(lock_sys->prdt_hash, block, PRDT_HEAPNO); - dberr_t err; + if (lock != nullptr) { + ut_ad(lock->type_mode & LOCK_PREDICATE); - /* If another transaction has an explicit lock request which locks - the predicate, waiting or granted, on the successor, the insert - has to wait. + /* If another transaction has an explicit lock request which locks + the predicate, waiting or granted, on the successor, the insert + has to wait. - Similar to GAP lock, we do not consider lock from inserts conflicts - with each other */ + Similar to GAP lock, we do not consider lock from inserts conflicts + with each other */ - const ulint mode = LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION; + const ulint mode = LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION; - const lock_t *wait_for = - lock_prdt_other_has_conflicting(mode, block, prdt, trx); + const lock_t *wait_for = + lock_prdt_other_has_conflicting(mode, block, prdt, trx); - if (wait_for != nullptr) { - rtr_mbr_t *mbr = prdt_get_mbr_from_prdt(prdt); + if (wait_for != nullptr) { + rtr_mbr_t *mbr = prdt_get_mbr_from_prdt(prdt); - trx_mutex_enter(trx); + trx_mutex_enter(trx); - /* Allocate MBR on the lock heap */ - lock_init_prdt_from_mbr(prdt, mbr, 0, trx->lock.lock_heap); + /* Allocate MBR on the lock heap */ + lock_init_prdt_from_mbr(prdt, mbr, 0, trx->lock.lock_heap); - RecLock rec_lock(thr, index, block, PRDT_HEAPNO, mode); + RecLock rec_lock(thr, index, block, PRDT_HEAPNO, mode); - /* Note that we may get DB_SUCCESS also here! */ + /* Note that we may get DB_SUCCESS also here! */ - err = rec_lock.add_to_waitq(wait_for, prdt); + err = rec_lock.add_to_waitq(wait_for, prdt); - trx_mutex_exit(trx); - - } else { - err = DB_SUCCESS; - } - - lock_mutex_exit(); + trx_mutex_exit(trx); + } + } + } // release block latch switch (err) { case DB_SUCCESS_LOCKED_REC: @@ -540,7 +529,9 @@ void lock_prdt_update_parent( { lock_t *lock; - lock_mutex_enter(); + /* We will operate on three blocks (left, right, parent). Latching their + shards without deadlock is easiest using exclusive global latch. */ + locksys::Global_exclusive_latch_guard guard{}; /* Get all locks in parent */ for (lock = @@ -576,8 +567,6 @@ void lock_prdt_update_parent( lock->trx, lock_prdt); } } - - lock_mutex_exit(); } /** Update predicate lock when page splits */ @@ -593,8 +582,7 @@ static void lock_prdt_update_split_low( { lock_t *lock; - lock_mutex_enter(); - + locksys::Shard_latches_guard guard{*block, *new_block}; for (lock = lock_rec_get_first_on_page_addr(lock_hash_get(type_mode), space, page_no); lock; lock = lock_rec_get_next_on_page(lock)) { @@ -639,8 +627,6 @@ static void lock_prdt_update_split_low( lock_prdt); } } - - lock_mutex_exit(); } /** Update predicate lock when page splits */ @@ -715,7 +701,7 @@ dberr_t lock_prdt_lock(buf_block_t *block, /*!< in/out: buffer block of rec */ index record, and this would not have been possible if another active transaction had modified this secondary index record. 
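locksys::Shard_latches_guard{*block, *new_block}, used above when predicate locks move during a page split, has to latch two page shards at once without deadlocking against a thread that requests the same two shards in the opposite order, and it must also cope with both pages hashing to the same shard. A rough sketch of one way to obtain those properties with the standard library; Two_shard_guard is a hypothetical stand-in, not the InnoDB class:

#include <mutex>

/* Hypothetical analogue of latching two (possibly identical) shard mutexes
without risking deadlock against a thread taking them in the opposite order. */
class Two_shard_guard {
 public:
  Two_shard_guard(std::mutex &a, std::mutex &b) {
    if (&a == &b) {
      /* Both pages hashed to the same shard: latch it only once. */
      a.lock();
      single_ = &a;
    } else {
      /* std::lock acquires both mutexes deadlock-free regardless of the
      order in which other threads request them. */
      std::lock(a, b);
      first_ = &a;
      second_ = &b;
    }
  }

  ~Two_shard_guard() {
    if (single_ != nullptr) {
      single_->unlock();
    } else {
      first_->unlock();
      second_->unlock();
    }
  }

  Two_shard_guard(const Two_shard_guard &) = delete;
  Two_shard_guard &operator=(const Two_shard_guard &) = delete;

 private:
  std::mutex *single_ = nullptr;
  std::mutex *first_ = nullptr;
  std::mutex *second_ = nullptr;
};

int main() {
  std::mutex shard_a, shard_b;
  {
    Two_shard_guard guard{shard_a, shard_b};
    /* ... move locks between the two latched shards ... */
  }
  {
    Two_shard_guard guard{shard_a, shard_a}; /* same-shard case is also safe */
  }
  return 0;
}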
*/ - lock_mutex_enter(); + locksys::Shard_latch_guard guard{block->get_page_id()}; const ulint prdt_mode = mode | type_mode; lock_t *lock = lock_rec_get_first_on_page(hash, block); @@ -765,8 +751,6 @@ dberr_t lock_prdt_lock(buf_block_t *block, /*!< in/out: buffer block of rec */ } } - lock_mutex_exit(); - if (status == LOCK_REC_SUCCESS_CREATED && type_mode == LOCK_PREDICATE) { /* Append the predicate in the lock record */ lock_prdt_set_prdt(lock, prdt); @@ -796,7 +780,8 @@ dberr_t lock_place_prdt_page_lock( index record, and this would not have been possible if another active transaction had modified this secondary index record. */ - lock_mutex_enter(); + RecID rec_id(page_id_t{space, page_no}, PRDT_HEAPNO); + locksys::Shard_latch_guard guard{rec_id.get_page_id()}; const lock_t *lock = lock_rec_get_first_on_page_addr(lock_sys->prdt_page_hash, space, page_no); @@ -820,7 +805,6 @@ dberr_t lock_place_prdt_page_lock( } if (lock == nullptr) { - RecID rec_id(space, page_no, PRDT_HEAPNO); RecLock rec_lock(index, rec_id, mode); trx_mutex_enter(trx); @@ -832,8 +816,6 @@ dberr_t lock_place_prdt_page_lock( #endif /* PRDT_DIAG */ } - lock_mutex_exit(); - return (DB_SUCCESS); } @@ -847,13 +829,11 @@ bool lock_test_prdt_page_lock(const trx_t *trx, space_id_t space, page_no_t page_no) { lock_t *lock; - lock_mutex_enter(); + locksys::Shard_latch_guard guard{page_id_t{space, page_no}}; lock = lock_rec_get_first_on_page_addr(lock_sys->prdt_page_hash, space, page_no); - lock_mutex_exit(); - return (lock == nullptr || trx == lock->trx); } @@ -871,7 +851,7 @@ void lock_prdt_rec_move( return; } - lock_mutex_enter(); + locksys::Shard_latches_guard guard{*receiver, *donator}; for (lock = lock_rec_get_first(lock_sys->prdt_hash, donator, PRDT_HEAPNO); lock != nullptr; lock = lock_rec_get_next(PRDT_HEAPNO, lock)) { @@ -883,8 +863,6 @@ void lock_prdt_rec_move( lock_prdt_add_to_queue(type_mode, receiver, lock->index, lock->trx, lock_prdt); } - - lock_mutex_exit(); } /** Removes predicate lock objects set on an index page which is discarded. @@ -897,7 +875,7 @@ void lock_prdt_page_free_from_discard(const buf_block_t *block, space_id_t space; page_no_t page_no; - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_page_shard(block->get_page_id())); space = block->page.id.space(); page_no = block->page.id.page_no(); diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc index 40e5f9ccbb99..ba17c6242f4f 100644 --- a/storage/innobase/lock/lock0wait.cc +++ b/storage/innobase/lock/lock0wait.cc @@ -72,10 +72,10 @@ static void lock_wait_table_release_slot( #endif /* UNIV_DEBUG */ lock_wait_mutex_enter(); - /* We omit trx_mutex_enter and lock_mutex_enter here, because we are only + /* We omit trx_mutex_enter and a lock_sys latches here, because we are only going to touch thr->slot, which is a member used only by lock0wait.cc and is sufficiently protected by lock_wait_mutex. Yes, there are readers who read - the thr->slot holding only trx->mutex and lock_sys->mutex, but they do so, + the thr->slot holding only trx->mutex and a lock_sys latch, but they do so, when they are sure that we were not woken up yet, so our thread can't be here. See comments in lock_wait_release_thread_if_suspended() for more details. */ @@ -379,15 +379,22 @@ static void lock_wait_release_thread_if_suspended(que_thr_t *thr) { 2. the only call to os_event_set is in lock_wait_release_thread_if_suspended 3. 
calls to lock_wait_release_thread_if_suspended are always performed after a call to lock_reset_lock_and_trx_wait(lock), and the sequence of the two is - in a critical section guarded by lock_mutex_enter + in a critical section guarded by lock_sys latch for the shard containing the + waiting lock 4. the lock_reset_lock_and_trx_wait(lock) asserts that lock->trx->lock.wait_lock == lock and sets lock->trx->lock.wait_lock = NULL Together all this facts imply, that it is impossible for a single trx to be woken up twice (unless it got to sleep again) because doing so requires reseting wait_lock to NULL. - We now hold exclusive lock_sys latch. */ - ut_ad(lock_mutex_own()); + We now hold either an exclusive lock_sys latch, or just the latch for the shard + which contains the lock which used to be trx->lock.wait_lock, but we can not assert + that because trx->lock.wait_lock is now NULL so we don't know for which shard + we hold the latch here. So, please imagine something like: + + ut_ad(locksys::owns_lock_shard(lock->trx->lock.wait_lock)); + */ + ut_ad(trx_mutex_own(trx)); /* We don't need the lock_wait_mutex here, because we know that the thread @@ -419,14 +426,10 @@ static void lock_wait_release_thread_if_suspended(que_thr_t *thr) { } void lock_reset_wait_and_release_thread_if_suspended(lock_t *lock) { - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_lock_shard(lock)); ut_ad(trx_mutex_own(lock->trx)); ut_ad(lock->trx->lock.wait_lock == lock); - /* Reset the wait flag and the back pointer to lock in trx */ - - lock_reset_lock_and_trx_wait(lock); - /* We clear blocking_trx here and not in lock_reset_lock_and_trx_wait(), as lock_reset_lock_and_trx_wait() is called also when the wait_lock is being moved from one page to another during B-tree reorganization, in which case @@ -434,24 +437,30 @@ void lock_reset_wait_and_release_thread_if_suspended(lock_t *lock) { and assigned to trx->lock.wait_lock, but the information about blocking trx is not so easy to restore, so it is easier to simply not clear blocking_trx until we are 100% sure that we want to wake up the trx, which is now. - Actually, clearing blocking_trx is not strictly required from correctness - perspective, it rather serves for: + Clearing blocking_trx helps with: 1. performance optimization, as lock_wait_snapshot_waiting_threads() can omit this trx when building wait-for-graph 2. debugging, as reseting blocking_trx makes it easier to spot it was not - properly set on subsequent waits. + properly set on subsequent waits. + 3. helping lock_make_trx_hit_list() notice that HP trx is no longer waiting + for a lock, so it can take a fast path */ lock->trx->lock.blocking_trx.store(nullptr); - /* We only release locks for which someone is waiting, and the trx which - decided to wait for the lock should have already set trx->lock.que_state to - TRX_QUE_LOCK_WAIT and called que_thr_stop() before releasing the lock-sys - latch. */ + /* We only release locks for which someone is waiting, and we possess a latch + on the shard in which the lock is stored, and the trx which decided to wait + for the lock should have already set trx->lock.que_state to TRX_QUE_LOCK_WAIT + and called que_thr_stop() before releasing the latch on this shard. */ ut_ad(lock->trx_que_state() == TRX_QUE_LOCK_WAIT); /* The following function releases the trx from lock wait */ que_thr_t *thr = que_thr_end_lock_wait(lock->trx); + /* Reset the wait flag and the back pointer to lock in trx. 
+ It is important to call it only after we obtain lock->trx->mutex, because + trx_mutex_enter makes some assertions based on trx->lock.wait_lock value */ + lock_reset_lock_and_trx_wait(lock); + if (thr != nullptr) { lock_wait_release_thread_if_suspended(thr); } @@ -480,14 +489,14 @@ static void lock_wait_check_and_cancel( if (trx_is_interrupted(trx) || (slot->wait_timeout < 100000000 && (wait_time > (int64_t)slot->wait_timeout || wait_time < 0))) { - /* Timeout exceeded or a wrap-around in system - time counter: cancel the lock request queued - by the transaction and release possible - other transactions waiting behind; it is - possible that the lock has already been - granted: in that case do nothing */ - - lock_mutex_enter(); + /* Timeout exceeded or a wrap-around in system time counter: cancel the lock + request queued by the transaction and release possible other transactions + waiting behind; it is possible that the lock has already been granted: in + that case do nothing. + The lock_cancel_waiting_and_release() needs exclusive global latch. + Also, we need to latch the shard containing wait_lock to read the field and + access the lock itself. */ + locksys::Global_exclusive_latch_guard guard{}; trx_mutex_enter(trx); @@ -497,8 +506,6 @@ static void lock_wait_check_and_cancel( lock_cancel_waiting_and_release(trx->lock.wait_lock); } - lock_mutex_exit(); - trx_mutex_exit(trx); } } @@ -520,7 +527,7 @@ struct waiting_trx_info_t { sorting criterion which is based on trx only. We use the pointer address, as any deterministic rule without ties will do. */ bool operator<(const waiting_trx_info_t &a, const waiting_trx_info_t &b) { - return a.trx < b.trx; + return std::less{}(a.trx, b.trx); } /** Check all slots for user threads that are waiting on locks, and if they have @@ -531,8 +538,9 @@ static void lock_wait_check_slots_for_timeouts() { for (auto slot = lock_sys->waiting_threads; slot < lock_sys->last_slot; ++slot) { - /* We are doing a read without the lock mutex and/or the trx mutex. This is - OK because a slot can't be freed or reserved without the lock wait mutex. */ + /* We are doing a read without latching the lock_sys or the trx mutex. + This is OK, because a slot can't be freed or reserved without the lock wait + mutex. */ if (slot->in_use) { lock_wait_check_and_cancel(slot); } @@ -657,7 +665,10 @@ static void lock_wait_build_wait_for_graph( sort(infos.begin(), infos.end()); waiting_trx_info_t needle{}; for (uint from = 0; from < n; ++from) { - ut_ad(from == 0 || infos[from - 1].trx < infos[from].trx); + /* Assert that the order used by sort and lower_bound depends only on the + trx field, as this is the only one we will initialize in the needle. */ + ut_ad(from == 0 || + std::less{}(infos[from - 1].trx, infos[from].trx)); needle.trx = infos[from].waits_for; auto it = std::lower_bound(infos.begin(), infos.end(), needle); @@ -675,11 +686,13 @@ static void lock_wait_build_wait_for_graph( static void lock_wait_rollback_deadlock_victim(trx_t *chosen_victim) { ut_ad(!trx_mutex_own(chosen_victim)); /* The call to lock_cancel_waiting_and_release requires exclusive latch on - whole lock_sys in case of table locks.*/ - ut_ad(lock_mutex_own()); + whole lock_sys. 
+ Also, we need to latch the shard containing wait_lock to read it and access + the lock itself.*/ + ut_ad(locksys::owns_exclusive_global_latch()); trx_mutex_enter(chosen_victim); chosen_victim->lock.was_chosen_as_deadlock_victim = true; - ut_a(chosen_victim->lock.wait_lock); + ut_a(chosen_victim->lock.wait_lock != nullptr); ut_a(chosen_victim->lock.que_state == TRX_QUE_LOCK_WAIT); lock_cancel_waiting_and_release(chosen_victim->lock.wait_lock); trx_mutex_exit(chosen_victim); @@ -905,7 +918,7 @@ static trx_t *lock_wait_choose_victim( on the whole lock_sys. In theory number of locks should not change while the transaction is waiting, but instead of proving that they can not wake up, it is easier to assert that we hold the mutex */ - ut_ad(lock_mutex_own()); + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(!cycle_ids.empty()); trx_t *chosen_victim = nullptr; auto sorted_trxs = lock_wait_order_for_choosing_victim(cycle_ids, infos); @@ -969,7 +982,8 @@ static bool lock_wait_trxs_are_still_in_slots( in it which form a deadlock cycle, checks if the transactions allegedly forming the deadlock have actually still wait for a lock, as opposed to being already notified about lock being granted or timeout, but still being present in the -slot. This is done by checking trx->lock.wait_lock under lock_sys mutex. +slot. This is done by checking trx->lock.wait_lock under exclusive global +lock_sys latch. @param[in] cycle_ids indexes in `infos` array, of transactions forming the deadlock cycle @param[in] infos information about all waiting transactions @@ -980,8 +994,9 @@ static bool lock_wait_trxs_are_still_waiting( ut_ad(lock_wait_mutex_own()); /* We are iterating over various transaction which may have locks in different tables/rows, thus we need exclusive latch on the whole lock_sys to make sure - no one will wake them up (say, a high priority trx could abort them) */ - ut_ad(lock_mutex_own()); + no one will wake them up (say, a high priority trx could abort them) or change + the wait_lock to NULL temporarily during B-tree page reorganization. */ + ut_ad(locksys::owns_exclusive_global_latch()); for (auto id : cycle_ids) { const auto trx = infos[id].trx; @@ -1146,7 +1161,7 @@ static bool lock_wait_check_candidate_cycle( ut::vector &cycle_ids, const ut::vector &infos, ut::vector &new_weights) { ut_ad(!lock_wait_mutex_own()); - ut_ad(!lock_mutex_own()); + ut_ad(!locksys::owns_exclusive_global_latch()); lock_wait_mutex_enter(); /* We have released all mutexes after we have built the `infos` snapshot and @@ -1160,8 +1175,8 @@ static bool lock_wait_check_candidate_cycle( If it has not changed, then we know that the trx's pointer still points to the same trx as the trx is sleeping, and thus has not finished and wasn't freed. So, we start by first checking that the slots still contain the trxs we are - interested in. This requires lock_wait_mutex, but not lock_mutex. - */ + interested in. This requires lock_wait_mutex, but does not require the + exclusive global latch. */ if (!lock_wait_trxs_are_still_in_slots(cycle_ids, infos)) { lock_wait_mutex_exit(); return false; @@ -1182,11 +1197,10 @@ static bool lock_wait_check_candidate_cycle( situation by looking at trx->lock.wait_lock, as each call to lock_wait_release_thread_if_suspended() is performed only after lock_reset_lock_and_trx_wait() resets trx->lock.wait_lock to NULL. - Checking trx->lock.wait_lock must be done under lock_mutex. + Checking trx->lock.wait_lock in reliable way requires global exclusive latch. 
*/ - lock_mutex_enter(); + locksys::Global_exclusive_latch_guard guard{}; if (!lock_wait_trxs_are_still_waiting(cycle_ids, infos)) { - lock_mutex_exit(); lock_wait_mutex_exit(); return false; } @@ -1195,11 +1209,11 @@ static bool lock_wait_check_candidate_cycle( We can now release lock_wait_mutex, because: 1. we have verified that trx->lock.wait_lock is not NULL for cycle_ids - 2. we hold lock_sys->mutex - 3. lock_sys->mutex is required to change trx->lock.wait_lock to NULL + 2. we hold exclusive global lock_sys latch + 3. lock_sys latch is required to change trx->lock.wait_lock to NULL 4. only after changing trx->lock.wait_lock to NULL a trx can finish - So as long as we hold lock_sys->mutex we can access trxs. + So as long as we hold exclusive global lock_sys latch we can access trxs. */ lock_wait_mutex_exit(); @@ -1209,7 +1223,6 @@ static bool lock_wait_check_candidate_cycle( lock_wait_handle_deadlock(chosen_victim, cycle_ids, infos, new_weights); - lock_mutex_exit(); return true; } diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc index dbf65dfba313..0c89cd1ee4bb 100644 --- a/storage/innobase/que/que0que.cc +++ b/storage/innobase/que/que0que.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -190,13 +190,11 @@ que_thr_t *que_thr_create(que_fork_t *parent, mem_heap_t *heap, que_thr_t *que_thr_end_lock_wait(trx_t *trx) /*!< in: transaction with que_state in QUE_THR_LOCK_WAIT */ { - que_thr_t *thr; - ibool was_active; + ut_ad(locksys::owns_lock_shard(trx->lock.wait_lock)); - ut_ad(lock_mutex_own()); ut_ad(trx_mutex_own(trx)); - thr = trx->lock.wait_thr; + que_thr_t *const thr = trx->lock.wait_thr; ut_ad(thr != nullptr); @@ -204,7 +202,7 @@ que_thr_t *que_thr_end_lock_wait(trx_t *trx) /*!< in: transaction with que_state /* In MySQL this is the only possible state here */ ut_a(thr->state == QUE_THR_LOCK_WAIT); - was_active = thr->is_active; + bool const was_active = thr->is_active; que_thr_move_to_run_state(thr); @@ -215,7 +213,7 @@ que_thr_t *que_thr_end_lock_wait(trx_t *trx) /*!< in: transaction with que_state /* In MySQL we let the OS thread (not just the query thread) to wait for the lock to be released: */ - return ((!was_active && thr != nullptr) ? thr : nullptr); + return !was_active ? thr : nullptr; } /** Inits a query thread for a command. */ @@ -591,16 +589,10 @@ static void que_thr_move_to_run_state( thr->state = QUE_THR_RUNNING; } -/** Stops a query thread if graph or trx is in a state requiring it. The - conditions are tested in the order (1) graph, (2) trx. 
- @return true if stopped */ -ibool que_thr_stop(que_thr_t *thr) /*!< in: query thread */ -{ - que_t *graph; +bool que_thr_stop(que_thr_t *thr) { + que_t *graph = thr->graph; trx_t *trx = thr_get_trx(thr); - graph = thr->graph; - ut_ad(trx_mutex_own(trx)); if (graph->state == QUE_FORK_COMMAND_WAIT) { @@ -620,10 +612,10 @@ ibool que_thr_stop(que_thr_t *thr) /*!< in: query thread */ } else { ut_ad(graph->state == QUE_FORK_ACTIVE); - return (FALSE); + return false; } - return (TRUE); + return true; } /** Decrements the query thread reference counts in the query graph and the diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index 43dd6385099a..63e4af7c9e1e 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -701,11 +701,14 @@ static void row_ins_foreign_trx_print(trx_t *trx) /*!< in: transaction */ return; } - lock_mutex_enter(); - n_rec_locks = lock_number_of_rows_locked(&trx->lock); - n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); - heap_size = mem_heap_get_size(trx->lock.lock_heap); - lock_mutex_exit(); + { + /** lock_number_of_rows_locked() requires global exclusive latch, and so + does accessing trx_locks with trx->mutex */ + locksys::Global_exclusive_latch_guard guard{}; + n_rec_locks = lock_number_of_rows_locked(&trx->lock); + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); + heap_size = mem_heap_get_size(trx->lock.lock_heap); + } trx_sys_mutex_enter(); diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index c275637ede6c..dd7fb46f29d0 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -76,7 +76,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "trx0rec.h" #include "trx0roll.h" #include "trx0undo.h" -#include "ut0mpmcbq.h" +#include "ut0cpu_cache.h" #include "ut0new.h" #include "current_thd.h" @@ -1153,8 +1153,11 @@ dberr_t row_lock_table_autoinc_for_mysql( ibool was_lock_wait; /* If we already hold an AUTOINC lock on the table then do nothing. - Note: We peek at the value of the current owner without acquiring - the lock mutex. 
*/ + Note: We peek at the value of the current owner without acquiring any latch, + which is OK, because if the equality holds, it means we were granted the lock, + and the only way table->autoinc_trx can subsequently change is by releasing + the lock, which can not happen concurrently with the thread running the trx.*/ + ut_ad(trx_can_be_handled_by_current_thread(trx)); if (trx == table->autoinc_trx) { return (DB_SUCCESS); } @@ -3990,8 +3993,8 @@ dberr_t row_drop_table_for_mysql(const char *name, trx_t *trx, bool nonatomic, if (!table->is_intrinsic()) { lock_remove_all_on_table(table, TRUE); } - ut_a(table->n_rec_locks == 0); - } else if (table->get_ref_count() > 0 || table->n_rec_locks > 0) { + ut_a(table->n_rec_locks.load() == 0); + } else if (table->get_ref_count() > 0 || table->n_rec_locks.load() > 0) { ibool added; ut_ad(0); @@ -4405,8 +4408,7 @@ dberr_t row_mysql_parallel_select_count_star( Shards n_recs; Counter::clear(n_recs); - struct Check_interrupt { - byte m_pad[INNOBASE_CACHE_LINE_SIZE - (sizeof(size_t) + sizeof(void *))]; + struct alignas(ut::INNODB_CACHE_LINE_SIZE) Check_interrupt { size_t m_count{}; const buf_block_t *m_prev_block{}; }; diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc index 9d7f2c34f920..511ed3c39111 100644 --- a/storage/innobase/row/row0vers.cc +++ b/storage/innobase/row/row0vers.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -284,20 +284,23 @@ static bool row_vers_find_matching( /** Finds out if an active transaction has inserted or modified a secondary index record. + @param[in] clust_rec clustered index record + @param[in] clust_index the clustered index + @param[in] sec_rec secondary index record + @param[in] sec_index the secondary index + @param[in] sec_offsets rec_get_offsets(sec_rec, sec_index) + @param[in,out] mtr mini-transaction @return 0 if committed, else the active transaction id; NOTE that this function can return false positives but never false - negatives. The caller must confirm all positive results by calling - trx_is_active() while holding lock_sys->mutex. */ + negatives. The caller must confirm all positive results by checking if + the trx is still active.*/ UNIV_INLINE -trx_t *row_vers_impl_x_locked_low( - const rec_t *const clust_rec, /*!< in: clustered index record */ - const dict_index_t *const clust_index, /*!< in: the clustered index */ - const rec_t *const sec_rec, /*!< in: secondary index record */ - const dict_index_t *const sec_index, /*!< in: the secondary index */ - const ulint - *const sec_offsets, /*!< in: rec_get_offsets(sec_rec, sec_index) */ - mtr_t *const mtr) /*!< in/out: mini-transaction */ -{ +trx_t *row_vers_impl_x_locked_low(const rec_t *const clust_rec, + const dict_index_t *const clust_index, + const rec_t *const sec_rec, + const dict_index_t *const sec_index, + const ulint *const sec_offsets, + mtr_t *const mtr) { trx_id_t trx_id; ibool corrupt; ulint comp; @@ -530,23 +533,14 @@ trx_t *row_vers_impl_x_locked_low( return trx; } -/** Finds out if an active transaction has inserted or modified a secondary - index record. 
- @return 0 if committed, else the active transaction id; - NOTE that this function can return false positives but never false - negatives. The caller must confirm all positive results by calling - trx_is_active() while holding lock_sys->mutex. */ -trx_t *row_vers_impl_x_locked( - const rec_t *rec, /*!< in: record in a secondary index */ - const dict_index_t *index, /*!< in: the secondary index */ - const ulint *offsets) /*!< in: rec_get_offsets(rec, index) */ -{ +trx_t *row_vers_impl_x_locked(const rec_t *rec, const dict_index_t *index, + const ulint *offsets) { mtr_t mtr; trx_t *trx; const rec_t *clust_rec; dict_index_t *clust_index; - ut_ad(!lock_mutex_own()); + ut_ad(!locksys::owns_exclusive_global_latch()); ut_ad(!trx_sys_mutex_own()); mtr_start(&mtr); diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index efb6e472be07..d2d87cbb37ac 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1256,18 +1256,30 @@ static void srv_refresh_innodb_monitor_stats(void) { mutex_exit(&srv_innodb_monitor_mutex); } -/** Outputs to a file the output of the InnoDB Monitor. - @return false if not all information printed - due to failure to obtain necessary mutex */ -ibool srv_printf_innodb_monitor( - FILE *file, /*!< in: output stream */ - ibool nowait, /*!< in: whether to wait for the - lock_sys_t:: mutex */ - ulint *trx_start_pos, /*!< out: file position of the start of - the list of active transactions */ - ulint *trx_end) /*!< out: file position of the end of - the list of active transactions */ -{ +/** +Prints info summary and info about all transactions to the file, recording the +position where the part about transactions starts. +@param[in] file output stream +@param[out] trx_start_pos file position of the start of the list of active + transactions +*/ +static void srv_printf_locks_and_transactions(FILE *file, + ulint *trx_start_pos) { + ut_ad(locksys::owns_exclusive_global_latch()); + lock_print_info_summary(file); + if (trx_start_pos) { + long t = ftell(file); + if (t < 0) { + *trx_start_pos = ULINT_UNDEFINED; + } else { + *trx_start_pos = (ulint)t; + } + } + lock_print_info_all_transactions(file); +} + +bool srv_printf_innodb_monitor(FILE *file, bool nowait, ulint *trx_start_pos, + ulint *trx_end) { ulint n_reserved; ibool ret; @@ -1325,27 +1337,22 @@ ibool srv_printf_innodb_monitor( mutex_exit(&dict_foreign_err_mutex); - /* Only if lock_print_info_summary proceeds correctly, - before we call the lock_print_info_all_transactions - to print all the lock information. IMPORTANT NOTE: This - function acquires the lock mutex on success. */ - ret = lock_print_info_summary(file, nowait); - - if (ret) { - if (trx_start_pos) { - long t = ftell(file); - if (t < 0) { - *trx_start_pos = ULINT_UNDEFINED; - } else { - *trx_start_pos = (ulint)t; - } + ret = true; + if (nowait) { + locksys::Global_exclusive_try_latch guard{}; + if (guard.owns_lock()) { + srv_printf_locks_and_transactions(file, trx_start_pos); + } else { + fputs("FAIL TO OBTAIN LOCK MUTEX, SKIP LOCK INFO PRINTING\n", file); + ret = false; } + } else { + locksys::Global_exclusive_latch_guard guard{}; + srv_printf_locks_and_transactions(file, trx_start_pos); + } - /* NOTE: If we get here then we have the lock mutex. This - function will release the lock mutex that we acquired when - we called the lock_print_info_summary() function earlier. 
*/ - - lock_print_info_all_transactions(file); + if (ret) { + ut_ad(lock_validate()); if (trx_end) { long t = ftell(file); @@ -1687,7 +1694,7 @@ void srv_monitor_thread() { ib_time_monotonic_t current_time; ib_time_monotonic_t time_elapsed; ulint mutex_skipped; - ibool last_srv_print_monitor; + bool last_srv_print_monitor = srv_print_innodb_monitor; ut_ad(!srv_read_only_mode); @@ -1695,10 +1702,9 @@ void srv_monitor_thread() { srv_last_monitor_time = last_monitor_time; mutex_skipped = 0; - last_srv_print_monitor = srv_print_innodb_monitor; loop: /* Wake up every 5 seconds to see if we need to print - monitor information or if signalled at shutdown. */ + monitor information or if signaled at shutdown. */ sig_count = os_event_reset(srv_monitor_event); @@ -1712,14 +1718,13 @@ void srv_monitor_thread() { last_monitor_time = ut_time_monotonic(); if (srv_print_innodb_monitor) { - /* Reset mutex_skipped counter everytime - srv_print_innodb_monitor changes. This is to - ensure we will not be blocked by lock_sys->mutex - for short duration information printing, - such as requested by sync_array_print_long_waits() */ + /* Reset mutex_skipped counter every time srv_print_innodb_monitor + changes. This is to ensure we will not be blocked by lock_sys global latch + for short duration information printing, such as requested by + sync_array_print_long_waits() */ if (!last_srv_print_monitor) { mutex_skipped = 0; - last_srv_print_monitor = TRUE; + last_srv_print_monitor = true; } if (!srv_printf_innodb_monitor(stderr, MUTEX_NOWAIT(mutex_skipped), @@ -1730,7 +1735,7 @@ void srv_monitor_thread() { mutex_skipped = 0; } } else { - last_srv_print_monitor = FALSE; + last_srv_print_monitor = false; } /* We don't create the temp files or associated diff --git a/storage/innobase/sync/sync0debug.cc b/storage/innobase/sync/sync0debug.cc index d4fd4f09cf3d..5c243345d9a9 100644 --- a/storage/innobase/sync/sync0debug.cc +++ b/storage/innobase/sync/sync0debug.cc @@ -138,7 +138,7 @@ struct LatchDebug { @return pointer to a thread's acquired latches. */ Latches *thread_latches(bool add = false) UNIV_NOTHROW; - /** Check that all the latches already owned by a thread have a lower + /** Check that all the latches already owned by a thread have a higher level than limit. @param[in] latches the thread's existing (acquired) latches @param[in] limit to check against @@ -441,11 +441,11 @@ LatchDebug::LatchDebug() { LEVEL_MAP_INSERT(SYNC_PAGE_CLEANER); LEVEL_MAP_INSERT(SYNC_PURGE_QUEUE); LEVEL_MAP_INSERT(SYNC_TRX_SYS_HEADER); - LEVEL_MAP_INSERT(SYNC_REC_LOCK); LEVEL_MAP_INSERT(SYNC_THREADS); LEVEL_MAP_INSERT(SYNC_TRX); LEVEL_MAP_INSERT(SYNC_TRX_SYS); - LEVEL_MAP_INSERT(SYNC_LOCK_SYS); + LEVEL_MAP_INSERT(SYNC_LOCK_SYS_GLOBAL); + LEVEL_MAP_INSERT(SYNC_LOCK_SYS_SHARDED); LEVEL_MAP_INSERT(SYNC_LOCK_WAIT_SYS); LEVEL_MAP_INSERT(SYNC_INDEX_ONLINE_LOG); LEVEL_MAP_INSERT(SYNC_IBUF_BITMAP); @@ -546,11 +546,6 @@ void LatchDebug::crash(const Latches *latches, const Latched *latched, ut_error; } -/** Check that all the latches already owned by a thread have a lower -level than limit. -@param[in] latches the thread's existing (acquired) latches -@param[in] limit to check against -@return latched info if there is one with a level <= limit . */ const Latched *LatchDebug::less(const Latches *latches, latch_level_t limit) const UNIV_NOTHROW { Latches::const_iterator end = latches->end(); @@ -565,6 +560,7 @@ const Latched *LatchDebug::less(const Latches *latches, } /** Do a basic ordering check. 
+Asserts that all the existing latches have a level higher than the in_level. @param[in] latches thread's existing latches @param[in] requested_level Level requested by latch @param[in] in_level declared ulint so that we can do level - 1. @@ -702,7 +698,7 @@ Latches *LatchDebug::check_order(const latch_t *latch, case SYNC_PAGE_ARCH_CLIENT: case SYNC_SEARCH_SYS: case SYNC_THREADS: - case SYNC_LOCK_SYS: + case SYNC_LOCK_SYS_GLOBAL: case SYNC_LOCK_WAIT_SYS: case SYNC_TRX_SYS: case SYNC_IBUF_BITMAP_MUTEX: @@ -755,12 +751,13 @@ Latches *LatchDebug::check_order(const latch_t *latch, case SYNC_TRX: - /* Either the thread must own the lock_sys->mutex, or - it is allowed to own only ONE trx_t::mutex. */ + /* Either the thread must own the lock_sys global latch, or + it is allowed to own only ONE trx_t::mutex. There are additional rules + for holding more than one trx_t::mutex @see trx_before_mutex_enter(). */ if (less(latches, level) != nullptr) { basic_check(latches, level, level - 1); - ut_a(find(latches, SYNC_LOCK_SYS) != nullptr); + ut_a(find(latches, SYNC_LOCK_SYS_GLOBAL) != nullptr); } break; @@ -774,6 +771,9 @@ Latches *LatchDebug::check_order(const latch_t *latch, case SYNC_BUF_ZIP_HASH: case SYNC_BUF_FLUSH_STATE: case SYNC_RSEG_ARRAY_HEADER: + case SYNC_LOCK_SYS_SHARDED: + case SYNC_BUF_PAGE_HASH: + case SYNC_BUF_BLOCK: /* We can have multiple mutexes of this type therefore we can only check whether the greater than condition holds. */ @@ -781,24 +781,6 @@ Latches *LatchDebug::check_order(const latch_t *latch, basic_check(latches, level, level - 1); break; - case SYNC_BUF_PAGE_HASH: - /* Fall through */ - case SYNC_BUF_BLOCK: - - if (less(latches, level) != nullptr) { - basic_check(latches, level, level - 1); - } - break; - - case SYNC_REC_LOCK: - - if (find(latches, SYNC_LOCK_SYS) != nullptr) { - basic_check(latches, level, SYNC_REC_LOCK - 1); - } else { - basic_check(latches, level, SYNC_REC_LOCK); - } - break; - case SYNC_IBUF_BITMAP: /* Either the thread must own the master mutex to all @@ -1391,7 +1373,11 @@ static void sync_latch_meta_init() UNIV_NOTHROW { LATCH_ADD_MUTEX(TRX, SYNC_TRX, trx_mutex_key); - LATCH_ADD_MUTEX(LOCK_SYS, SYNC_LOCK_SYS, lock_mutex_key); + LATCH_ADD_MUTEX(LOCK_SYS_PAGE, SYNC_LOCK_SYS_SHARDED, + lock_sys_page_mutex_key); + + LATCH_ADD_MUTEX(LOCK_SYS_TABLE, SYNC_LOCK_SYS_SHARDED, + lock_sys_table_mutex_key); LATCH_ADD_MUTEX(LOCK_SYS_WAIT, SYNC_LOCK_WAIT_SYS, lock_wait_mutex_key); @@ -1455,6 +1441,9 @@ static void sync_latch_meta_init() UNIV_NOTHROW { LATCH_ADD_RWLOCK(RSEGS, SYNC_RSEGS, rsegs_lock_key); + LATCH_ADD_RWLOCK(LOCK_SYS_GLOBAL, SYNC_LOCK_SYS_GLOBAL, + lock_sys_global_rw_lock_key); + LATCH_ADD_RWLOCK(UNDO_SPACES, SYNC_UNDO_SPACES, undo_spaces_lock_key); LATCH_ADD_MUTEX(UNDO_DDL, SYNC_UNDO_DDL, PFS_NOT_INSTRUMENTED); diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc index c640a0dbf685..a3f853b6621c 100644 --- a/storage/innobase/sync/sync0sync.cc +++ b/storage/innobase/sync/sync0sync.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2020, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. 
Portions of this file contain modifications contributed and copyrighted by @@ -122,7 +122,8 @@ mysql_pfs_key_t trx_mutex_key; mysql_pfs_key_t trx_pool_mutex_key; mysql_pfs_key_t trx_pool_manager_mutex_key; mysql_pfs_key_t temp_pool_manager_mutex_key; -mysql_pfs_key_t lock_mutex_key; +mysql_pfs_key_t lock_sys_table_mutex_key; +mysql_pfs_key_t lock_sys_page_mutex_key; mysql_pfs_key_t lock_wait_mutex_key; mysql_pfs_key_t trx_sys_mutex_key; mysql_pfs_key_t srv_sys_mutex_key; @@ -154,6 +155,7 @@ mysql_pfs_key_t buf_block_debug_latch_key; #endif /* UNIV_DEBUG */ mysql_pfs_key_t undo_spaces_lock_key; mysql_pfs_key_t rsegs_lock_key; +mysql_pfs_key_t lock_sys_global_rw_lock_key; mysql_pfs_key_t dict_operation_lock_key; mysql_pfs_key_t dict_table_stats_key; mysql_pfs_key_t hash_table_locks_key; diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc index 0fc9d3d2fbf6..9013822a5a9a 100644 --- a/storage/innobase/trx/trx0i_s.cc +++ b/storage/innobase/trx/trx0i_s.cc @@ -187,15 +187,16 @@ struct trx_i_s_cache_t { #define CACHE_STORAGE_INITIAL_SIZE 1024 /** Number of hash cells in the cache storage */ #define CACHE_STORAGE_HASH_CELLS 2048 - ha_storage_t *storage; /*!< storage for external volatile - data that may become unavailable - when we release - lock_sys->mutex or trx_sys->mutex */ - ulint mem_allocd; /*!< the amount of memory - allocated with mem_alloc*() */ - ibool is_truncated; /*!< this is TRUE if the memory - limit was hit and thus the data - in the cache is truncated */ + /** storage for external volatile data that may become unavailable when we + release exclusive global locksys latch or trx_sys->mutex */ + ha_storage_t *storage; + + /** the amount of memory allocated with mem_alloc*() */ + ulint mem_allocd; + + /** this is TRUE if the memory limit was hit and thus the data in the cache is + truncated */ + bool is_truncated; }; /** This is the intermediate buffer where data needed to fill the @@ -435,7 +436,11 @@ static ibool fill_trx_row( /* We are going to read various trx->lock fields protected by trx->mutex */ ut_ad(trx_mutex_own(trx)); - ut_ad(lock_mutex_own()); + /* We are going to read TRX_WEIGHT, lock_number_of_rows_locked() and + lock_number_of_tables_locked() which requires latching the lock_sys. + Also, we need it to avoid reading temporary NULL value set to wait_lock by a + B-tree page reorganization. 
*/ + ut_ad(locksys::owns_exclusive_global_latch()); row->trx_id = trx_get_id_for_print(trx); row->trx_started = (ib_time_t)trx->start_time; @@ -645,12 +650,10 @@ void p_s_fill_lock_data(const char **lock_data, const lock_t *lock, const rec_t *rec; const dict_index_t *index; ulint n_fields; - mem_heap_t *heap; - ulint offsets_onstack[REC_OFFS_NORMAL_SIZE]; - ulint *offsets; char buf[TRX_I_S_LOCK_DATA_MAX_LEN]; ulint buf_used; ulint i; + Rec_offsets rec_offsets; mtr_start(&mtr); @@ -667,9 +670,6 @@ void p_s_fill_lock_data(const char **lock_data, const lock_t *lock, page = reinterpret_cast(buf_block_get_frame(block)); - rec_offs_init(offsets_onstack); - offsets = offsets_onstack; - rec = page_find_rec_with_heap_no(page, heap_no); index = lock_rec_get_index(lock); @@ -678,8 +678,7 @@ void p_s_fill_lock_data(const char **lock_data, const lock_t *lock, ut_a(n_fields > 0); - heap = nullptr; - offsets = rec_get_offsets(rec, index, offsets, n_fields, &heap); + const ulint *offsets = rec_offsets.compute(rec, index); /* format and store the data */ @@ -692,14 +691,6 @@ void p_s_fill_lock_data(const char **lock_data, const lock_t *lock, *lock_data = container->cache_string(buf); - if (heap != nullptr) { - /* this means that rec_get_offsets() has created a new - heap and has stored offsets in it; check that this is - really the case and free the heap */ - ut_a(offsets != offsets_onstack); - mem_heap_free(heap); - } - mtr_commit(&mtr); } @@ -772,7 +763,11 @@ static ibool add_trx_relevant_locks_to_cache( requested lock row, or NULL or undefined */ { - ut_ad(lock_mutex_own()); + /* We are about to iterate over locks for various tables/rows so we can not + narrow the required latch to any specific shard, and thus require exclusive + access to lock_sys. This is also needed to avoid observing NULL temporarily + set to wait_lock during B-tree page reorganization. */ + ut_ad(locksys::owns_exclusive_global_latch()); /* If transaction is waiting we add the wait lock and all locks from another transactions that are blocking the wait lock. */ @@ -872,6 +867,9 @@ static void fetch_data_into_cache_low( transactions */ trx_ut_list_t *trx_list) /*!< in: trx list */ { + /* We are going to iterate over many different shards of lock_sys so we need + exclusive access */ + ut_ad(locksys::owns_exclusive_global_latch()); trx_t *trx; bool rw_trx_list = trx_list == &trx_sys->rw_trx_list; @@ -903,7 +901,7 @@ static void fetch_data_into_cache_low( ut_ad(trx->in_rw_trx_list == rw_trx_list); if (!add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row)) { - cache->is_truncated = TRUE; + cache->is_truncated = true; trx_mutex_exit(trx); return; } @@ -913,7 +911,7 @@ static void fetch_data_into_cache_low( /* memory could not be allocated */ if (trx_row == nullptr) { - cache->is_truncated = TRUE; + cache->is_truncated = true; trx_mutex_exit(trx); return; } @@ -921,7 +919,7 @@ static void fetch_data_into_cache_low( if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) { /* memory could not be allocated */ --cache->innodb_trx.rows_used; - cache->is_truncated = TRUE; + cache->is_truncated = true; trx_mutex_exit(trx); return; } @@ -934,7 +932,9 @@ static void fetch_data_into_cache_low( table cache buffer. Cache must be locked for write. 
*/ static void fetch_data_into_cache(trx_i_s_cache_t *cache) /*!< in/out: cache */ { - ut_ad(lock_mutex_own()); + /* We are going to iterate over many different shards of lock_sys so we need + exclusive access */ + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx_sys_mutex_own()); trx_i_s_cache_clear(cache); @@ -946,7 +946,7 @@ static void fetch_data_into_cache(trx_i_s_cache_t *cache) /*!< in/out: cache */ /* Capture the state of the read-only active transactions */ fetch_data_into_cache_low(cache, false, &trx_sys->mysql_trx_list); - cache->is_truncated = FALSE; + cache->is_truncated = false; } /** Update the transactions cache if it has not been read for some time. @@ -959,26 +959,21 @@ int trx_i_s_possibly_fetch_data_into_cache( return (1); } - /* We need to read trx_sys and record/table lock queues */ + { + /* We need to read trx_sys and record/table lock queues */ + locksys::Global_exclusive_latch_guard guard{}; - lock_mutex_enter(); + trx_sys_mutex_enter(); - trx_sys_mutex_enter(); + fetch_data_into_cache(cache); - fetch_data_into_cache(cache); - - trx_sys_mutex_exit(); - - lock_mutex_exit(); + trx_sys_mutex_exit(); + } return (0); } -/** Returns TRUE if the data in the cache is truncated due to the memory - limit posed by TRX_I_S_MEM_LIMIT. - @return true if truncated */ -ibool trx_i_s_cache_is_truncated(trx_i_s_cache_t *cache) /*!< in: cache */ -{ +bool trx_i_s_cache_is_truncated(trx_i_s_cache_t *cache) { return (cache->is_truncated); } @@ -987,8 +982,10 @@ void trx_i_s_cache_init(trx_i_s_cache_t *cache) /*!< out: cache to init */ { /* The latching is done in the following order: acquire trx_i_s_cache_t::rw_lock, X - acquire lock mutex - release lock mutex + acquire locksys exclusive global latch + acquire trx_sys mutex + release trx_sys mutex + release locksys exclusive global latch release trx_i_s_cache_t::rw_lock acquire trx_i_s_cache_t::rw_lock, S acquire trx_i_s_cache_t::last_read_mutex @@ -1014,7 +1011,7 @@ void trx_i_s_cache_init(trx_i_s_cache_t *cache) /*!< out: cache to init */ cache->mem_allocd = 0; - cache->is_truncated = FALSE; + cache->is_truncated = false; } /** Free the INFORMATION SCHEMA trx related cache. */ diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc index 648282a6b699..bd9f49869f9c 100644 --- a/storage/innobase/trx/trx0roll.cc +++ b/storage/innobase/trx/trx0roll.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2020, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the @@ -637,8 +637,7 @@ static ibool trx_rollback_resurrected( ut_ad(trx_sys_mutex_own()); /* The trx->is_recovered flag and trx->state are set - atomically under the protection of the trx->mutex (and - lock_sys->mutex) in lock_trx_release_locks(). We do not want + atomically under the protection of the trx->mutex . We do not want to accidentally clean up a non-recovered transaction here. 
*/ trx_mutex_enter(trx); diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index eb7bf180293b..832b901fe006 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -557,8 +557,7 @@ static void trx_validate_state_before_free(trx_t *trx) { if (trx->n_mysql_tables_in_use != 0 || trx->mysql_n_tables_locked != 0) { ib::error(ER_IB_MSG_1203) - << "MySQL is freeing a thd though" - " trx->n_mysql_tables_in_use is " + << "MySQL is freeing a thd though trx->n_mysql_tables_in_use is " << trx->n_mysql_tables_in_use << " and trx->mysql_n_tables_locked is " << trx->mysql_n_tables_locked << "."; @@ -1820,7 +1819,7 @@ written */ ut_ad(trx->rsegs.m_redo.rseg == nullptr); ut_ad(!trx->in_rw_trx_list); - /* Note: We are asserting without holding the lock mutex. But + /* Note: We are asserting without holding the locksys latch. But that is OK because this transaction is not waiting and cannot be rolled back and no new locks can (or should not) be added because it is flagged as a non-locking read-only transaction. */ @@ -2488,16 +2487,10 @@ void trx_print_low(FILE *f, } } -/** Prints info about a transaction. - The caller must hold lock_sys->mutex and trx_sys->mutex. - When possible, use trx_print() instead. */ -void trx_print_latched( - FILE *f, /*!< in: output stream */ - const trx_t *trx, /*!< in: transaction */ - ulint max_query_len) /*!< in: max query length to print, - or 0 to use the default max length */ -{ - ut_ad(lock_mutex_own()); +void trx_print_latched(FILE *f, const trx_t *trx, ulint max_query_len) { + /* We need exclusive access to lock_sys for lock_number_of_rows_locked(), + and accessing trx->lock fields without trx->mutex.*/ + ut_ad(locksys::owns_exclusive_global_latch()); ut_ad(trx_sys_mutex_own()); trx_print_low(f, trx, max_query_len, lock_number_of_rows_locked(&trx->lock), @@ -2505,27 +2498,11 @@ void trx_print_latched( mem_heap_get_size(trx->lock.lock_heap)); } -/** Prints info about a transaction. - Acquires and releases lock_sys->mutex and trx_sys->mutex. */ -void trx_print(FILE *f, /*!< in: output stream */ - const trx_t *trx, /*!< in: transaction */ - ulint max_query_len) /*!< in: max query length to print, - or 0 to use the default max length */ -{ - ulint n_rec_locks; - ulint n_trx_locks; - ulint heap_size; - - lock_mutex_enter(); - n_rec_locks = lock_number_of_rows_locked(&trx->lock); - n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); - heap_size = mem_heap_get_size(trx->lock.lock_heap); - lock_mutex_exit(); - +void trx_print(FILE *f, const trx_t *trx, ulint max_query_len) { + /* trx_print_latched() requires exclusive global latch */ + locksys::Global_exclusive_latch_guard guard{}; mutex_enter(&trx_sys->mutex); - - trx_print_low(f, trx, max_query_len, n_rec_locks, n_trx_locks, heap_size); - + trx_print_latched(f, trx, max_query_len); mutex_exit(&trx_sys->mutex); } @@ -2547,8 +2524,7 @@ ibool trx_assert_started(const trx_t *trx) /*!< in: transaction */ /* trx->state can change from or to NOT_STARTED while we are holding trx_sys->mutex for non-locking autocommit selects but not for other - types of transactions. It may change from ACTIVE to PREPARED. Unless - we are holding lock_sys->mutex, it may also change to COMMITTED. */ + types of transactions. It may change from ACTIVE to PREPARED. */ switch (trx->state) { case TRX_STATE_PREPARED: @@ -2565,6 +2541,145 @@ ibool trx_assert_started(const trx_t *trx) /*!< in: transaction */ ut_error; } + +/* +Interaction between Lock-sys and trx->mutex-es is rather complicated. 
+In particular we allow a thread performing Lock-sys operations to request
+another trx->mutex even though it already holds one for a different trx.
+Therefore one has to prove that it is impossible to form a deadlock cycle in
+the imaginary wait-for-graph in which edges go from a thread trying to obtain
+trx->mutex to the thread which holds it at the moment.
+
+In the past it was simple, because Lock-sys was protected by a global mutex,
+which meant that there was at most one thread which could try to possess more
+than one trx->mutex - one can not form a cycle in a graph in which only
+one node has both incoming and outgoing edges.
+
+Today it is much harder to prove, because we have sharded the Lock-sys mutex,
+and now multiple threads can perform Lock-sys operations in parallel, as long
+as they happen in different shards.
+
+Here's my attempt at the proof.
+
+Assumption 1.
+  If a thread attempts to acquire more than one trx->mutex, then it either has
+  the exclusive global latch, or it attempts to acquire exactly two of them,
+  and just before calling mutex_enter for the second time it saw
+  trx1->lock.wait_lock==nullptr, trx2->lock.wait_lock!=nullptr, and it held the
+  latch for the shard containing trx2->lock.wait_lock.
+
+@see asserts in trx_before_mutex_enter
+
+Assumption 2.
+  The Lock-sys latches are taken before any trx->mutex.
+
+@see asserts in sync0debug.cc
+
+Assumption 3.
+  Changing trx->lock.wait_lock from NULL to non-NULL requires latching
+  trx->mutex and the shard containing the new wait_lock value.
+
+@see asserts in lock_set_lock_and_trx_wait()
+
+Assumption 4.
+  Changing trx->lock.wait_lock from non-NULL to NULL requires latching the
+  shard containing the old wait_lock value.
+
+@see asserts in lock_reset_lock_and_trx_wait()
+
+Assumption 5.
+  If a thread is latching two Lock-sys shards then it's acquiring and releasing
+  both shards together (that is, without interleaving it with trx->mutex
+  operations).
+
+@see Shard_latches_guard
+
+Theorem 1.
+  If Assumptions 1-5 hold, then it's impossible for a trx_mutex_enter() call
+  to deadlock.
+
+By proving the theorem, and observing that the assertions hold for multiple
+runs of the test suite on a debug build, we gain more and more confidence that
+trx_mutex_enter() calls can not deadlock.
+
+The intuitive, albeit imprecise, version of the proof is that by Assumption 1
+each edge of the deadlock cycle leads from a trx with NULL trx->lock.wait_lock
+to one with non-NULL wait_lock, which means the cycle has only one edge.
+
+The difficulty lies in that wait_lock is a field which can be modified over
+time from several threads, so care must be taken to clarify at which moment in
+time we make our observations and from whose perspective.
+
+We will now formally prove Theorem 1.
+Assume otherwise, that is, that we are in a thread which has just started a
+call to mutex_enter(trx_a->mutex) and caused a deadlock.
+
+Fact 0. There is no thread which possesses the exclusive Lock-sys latch, since
+        to form a deadlock one needs at least two threads inside Lock-sys
+Fact 1. Each thread participating in the deadlock holds one trx mutex and waits
+        for the second one it tried to acquire
+Fact 2. Thus each thread participating in the deadlock had gone through the
+        "else" branch inside trx_before_mutex_enter(), so it verifies
+        Assumption 1.
+Fact 3. Our thread owns_lock_shard(trx_a->lock.wait_lock)
+Fact 4.
Another thread has latched trx_a->mutex as the first of its two latches
+
+Consider the situation from the point of view of this other thread, which is
+now in the deadlock waiting for mutex_enter(trx_b->mutex) for some
+trx_b!=trx_a.
+By Fact 2 and Assumption 1, it had to take the "else" branch on the way there,
+and thus it saw trx_a->lock.wait_lock == nullptr at some moment in time.
+This observation was either before or after our observation that
+trx_a->lock.wait_lock != nullptr (again Fact 2 and Assumption 1).
+
+If our thread observed the non-NULL value first, then it means a change from
+non-NULL to NULL has happened, which by Assumption 4 requires a shard latch,
+which only our thread possesses - and we could not have manipulated the
+wait_lock as we are in a deadlock.
+
+If the other thread observed NULL first, then it means that the value has
+changed to non-NULL, which requires trx_a->mutex according to Assumption 3, yet
+this mutex was held the entire time by the other thread, since it observed the
+NULL just before it deadlocked, so it could not have changed it, either.
+
+So, there is no way the value of wait_lock has changed from NULL to non-NULL or
+vice-versa, yet one thread sees NULL and the other non-NULL - this
+contradiction ends the proof.
+*/
+
+static thread_local const trx_t *trx_first_latched_trx = nullptr;
+static thread_local int32_t trx_latched_count = 0;
+static thread_local bool trx_allowed_two_latches = false;
+
+void trx_before_mutex_enter(const trx_t *trx, bool first_of_two) {
+  if (0 == trx_latched_count++) {
+    ut_a(trx_first_latched_trx == nullptr);
+    trx_first_latched_trx = trx;
+    if (first_of_two) {
+      trx_allowed_two_latches = true;
+    }
+  } else {
+    ut_a(!first_of_two);
+    if (!locksys::owns_exclusive_global_latch()) {
+      ut_a(trx_allowed_two_latches);
+      ut_a(trx_latched_count == 2);
+      ut_a(trx_first_latched_trx->lock.wait_lock == nullptr);
+      ut_a(trx_first_latched_trx != trx);
+      /* This is not very safe, because to read trx->lock.wait_lock we should
+      already either latch trx->mutex (which we don't) or the shard with
+      trx->lock.wait_lock. But our claim is precisely that we have latched
+      this shard, and we want to check that here. */
+      ut_a(trx->lock.wait_lock != nullptr);
+      ut_a(locksys::owns_lock_shard(trx->lock.wait_lock));
+    }
+  }
+}
+void trx_before_mutex_exit(const trx_t *trx) {
+  ut_a(0 < trx_latched_count);
+  if (0 == --trx_latched_count) {
+    ut_a(trx_first_latched_trx == trx);
+    trx_first_latched_trx = nullptr;
+    trx_allowed_two_latches = false;
+  }
+}
 #endif /* UNIV_DEBUG */
 
 /** Compares the "weight" (or size) of two transactions. Transactions that
@@ -2574,6 +2689,8 @@ ibool trx_assert_started(const trx_t *trx) /*!< in: transaction */
 bool trx_weight_ge(const trx_t *a, /*!< in: transaction to be compared */
                    const trx_t *b) /*!< in: transaction to be compared */
 {
+  /* To read TRX_WEIGHT we need an exclusive global lock_sys latch */
+  ut_ad(locksys::owns_exclusive_global_latch());
   ibool a_notrans_edit;
   ibool b_notrans_edit;
@@ -2861,7 +2978,7 @@ int trx_recover_for_mysql(
     /* The state of a read-write transaction cannot change
     from or to NOT_STARTED while we are holding the
     trx_sys->mutex. It may change to PREPARED, but not if
-    trx->is_recovered. It may also change to COMMITTED. */
+    trx->is_recovered.
*/ if (trx_state_eq(trx, TRX_STATE_PREPARED)) { if (get_info_about_prepared_transaction(&txn_list[count], trx, mem_root)) break; @@ -2899,8 +3016,7 @@ int trx_recover_for_mysql( /** This function is used to find one X/Open XA distributed transaction which is in the prepared state @return trx on match, the trx->xid will be invalidated; - note that the trx may have been committed, unless the caller is - holding lock_sys->mutex */ + */ static MY_ATTRIBUTE((warn_unused_result)) trx_t *trx_get_trx_by_xid_low( const XID *xid) /*!< in: X/Open XA transaction identifier */ @@ -2930,14 +3046,7 @@ static MY_ATTRIBUTE((warn_unused_result)) trx_t *trx_get_trx_by_xid_low( return (trx); } -/** This function is used to find one X/Open XA distributed transaction - which is in the prepared state - @return trx or NULL; on match, the trx->xid will be invalidated; - note that the trx may have been committed, unless the caller is - holding lock_sys->mutex */ -trx_t *trx_get_trx_by_xid( - const XID *xid) /*!< in: X/Open XA transaction identifier */ -{ +trx_t *trx_get_trx_by_xid(const XID *xid) { trx_t *trx; if (xid == nullptr) {