diff --git a/pkg/kv/kvclient/kvcoord/txn_coord_sender.go b/pkg/kv/kvclient/kvcoord/txn_coord_sender.go
index b201d985cf59..259e616dc1c4 100644
--- a/pkg/kv/kvclient/kvcoord/txn_coord_sender.go
+++ b/pkg/kv/kvclient/kvcoord/txn_coord_sender.go
@@ -586,9 +586,25 @@ func (tc *TxnCoordSender) Send(
 // transactions are guaranteed to start with higher timestamps, regardless
 // of the gateway they use. This ensures that all causally dependent
 // transactions commit with higher timestamps, even if their read and writes
-// sets do not conflict with the original transaction's. This obviates the
-// need for uncertainty intervals and prevents the "causal reverse" anamoly
-// which can be observed by a third, concurrent transaction.
+// sets do not conflict with the original transaction's. This prevents the
+// "causal reverse" anomaly which can be observed by a third, concurrent
+// transaction.
+//
+// Even when in linearizable mode and performing this extra wait on the commit
+// of read-write transactions, uncertainty intervals are still necessary. This
+// is to ensure that any two reads that touch overlapping keys but are executed
+// on different nodes obey real-time ordering and do not violate the "monotonic
+// reads" property. Without uncertainty intervals, it would be possible for a
+// read on a node with a fast clock (ts@15) to observe a committed value (ts@10)
+// and then a later read on a node with a slow clock (ts@5) to miss the
+// committed value. When contrasting this with Google Spanner, we notice that
+// Spanner performs a similar commit-wait but then does not include uncertainty
+// intervals. The reason this works in Spanner is that read-write transactions
+// in Spanner hold their locks across the commit-wait duration, which blocks
+// concurrent readers and enforces real-time ordering between any two readers
+// as well as between the writer and any future reader. Read-write transactions
+// in CockroachDB do not hold locks across commit-wait (they release them
+// beforehand), so the uncertainty interval is still needed.
 //
 // For more, see https://www.cockroachlabs.com/blog/consistency-model/ and
 // docs/RFCS/20200811_non_blocking_txns.md.
diff --git a/pkg/util/hlc/BUILD.bazel b/pkg/util/hlc/BUILD.bazel
index 630ed7223789..83a0b713ab03 100644
--- a/pkg/util/hlc/BUILD.bazel
+++ b/pkg/util/hlc/BUILD.bazel
@@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
 go_library(
     name = "hlc",
     srcs = [
+        "doc.go",
         "hlc.go",
         "hlc_clock_device_linux.go",
         "hlc_clock_device_stub.go",
diff --git a/pkg/util/hlc/doc.go b/pkg/util/hlc/doc.go
new file mode 100644
index 000000000000..717ee45e3d69
--- /dev/null
+++ b/pkg/util/hlc/doc.go
@@ -0,0 +1,288 @@
+// Copyright 2021 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+/*
+Package hlc implements the Hybrid Logical Clock outlined in "Logical Physical
+Clocks and Consistent Snapshots in Globally Distributed Databases", available
+online at http://www.cse.buffalo.edu/tech-reports/2014-04.pdf.
+
+Each node within a CockroachDB cluster maintains a hybrid logical clock (HLC),
+which provides timestamps that are a combination of physical and logical time.
+Physical time is based on a node’s coarsely-synchronized system clock, and
+logical time is based on Lamport’s clocks.
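+
+As a rough illustration, an HLC timestamp pairs a physical wall time with a
+logical counter, ordered first by the physical component and then by the
+logical one. The sketch below is a self-contained toy, not this package's
+actual definition (the real type is hlc.Timestamp):
+
+	package main
+
+	import "fmt"
+
+	// Timestamp is a toy hybrid timestamp: physical wall time in
+	// nanoseconds plus a logical counter to break ties.
+	type Timestamp struct {
+		WallTime int64 // physical component
+		Logical  int32 // logical component, breaks ties on equal WallTime
+	}
+
+	// Less orders timestamps by physical time, then by logical time.
+	func (t Timestamp) Less(s Timestamp) bool {
+		return t.WallTime < s.WallTime ||
+			(t.WallTime == s.WallTime && t.Logical < s.Logical)
+	}
+
+	func main() {
+		a := Timestamp{WallTime: 100, Logical: 0}
+		b := Timestamp{WallTime: 100, Logical: 1}
+		fmt.Println(a.Less(b)) // true: the logical component breaks the tie
+	}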
+
+Hybrid-logical clocks provide a few important properties:
+
+Causality tracking
+
+HLCs provide causality tracking through a combination of a physical and a
+logical (to break ties) component upon each inter-node exchange. Nodes attach
+HLC timestamps to each message that they send and use HLC timestamps from each
+message that they receive to update their local clock.
+
+There are currently three channels through which HLC timestamps are passed
+between nodes in a cluster:
+
+ - Raft (unidirectional): proposers of Raft commands (i.e. leaseholders) attach
+   clock readings to these commands, which are later consumed by followers when
+   commands are applied to their Raft state machine.
+
+   Ref: (kvserverpb.ReplicatedEvalResult).WriteTimestamp.
+   Ref: (roachpb.MergeTrigger).FreezeStart.
+
+ - BatchRequest API (bidirectional): clients and servers of the KV BatchRequest
+   API will attach HLC clock readings on requests and responses (successes and
+   errors).
+
+   Ref: (roachpb.Header).Timestamp.
+   Ref: (roachpb.BatchResponse_Header).Now.
+   Ref: (roachpb.Error).Now.
+
+ - DistSQL flows (unidirectional): leaves of a DistSQL flow will pass clock
+   readings back to the root of the flow. Currently, this only takes place on
+   errors, and relates to the "Transaction retry errors" interaction detailed
+   below.
+
+   Ref: (roachpb.Error).Now.
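+
+Concretely, the update-on-receipt rule can be sketched as follows. This is a
+simplified, self-contained model of the behavior, not the actual
+implementation in (*hlc.Clock).Update, which additionally enforces offset
+bounds and thread-safety:
+
+	package main
+
+	import (
+		"fmt"
+		"time"
+	)
+
+	type Timestamp struct {
+		WallTime int64
+		Logical  int32
+	}
+
+	// Clock is a toy hybrid logical clock.
+	type Clock struct {
+		ts Timestamp
+	}
+
+	// Update forwards the clock past a timestamp received from a remote
+	// node, keeping the clock from ever running backward.
+	func (c *Clock) Update(remote Timestamp) Timestamp {
+		wall := time.Now().UnixNano()
+		switch {
+		case wall > c.ts.WallTime && wall > remote.WallTime:
+			// Physical time leads both clocks: use it, reset logical.
+			c.ts = Timestamp{WallTime: wall}
+		case remote.WallTime > c.ts.WallTime:
+			// Remote leads: adopt its wall time, tick past its logical.
+			c.ts = Timestamp{WallTime: remote.WallTime, Logical: remote.Logical + 1}
+		case c.ts.WallTime > remote.WallTime:
+			// Local leads: tick the logical component.
+			c.ts.Logical++
+		default:
+			// Equal wall times: tick past the larger logical component.
+			if remote.Logical > c.ts.Logical {
+				c.ts.Logical = remote.Logical
+			}
+			c.ts.Logical++
+		}
+		return c.ts
+	}
+
+	func main() {
+		var c Clock
+		remote := Timestamp{WallTime: time.Now().UnixNano() + int64(time.Second)}
+		fmt.Println(c.Update(remote)) // adopts the faster remote clock
+	}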
+
+Capturing causal relationships between events on different nodes is critical
+for enforcing invariants within CockroachDB. What follows is an enumeration of
+each of the interactions in which causality passing through an HLC is necessary
+for correctness. It is intended to be exhaustive.
+
+ - Cooperative lease transfers (Raft channel). During a cooperative lease
+   transfer from one replica of a range to another, the outgoing leaseholder
+   revokes its lease before its expiration time and consults its clock to
+   determine the start time of the next lease. It then proposes this new lease
+   through Raft (see the Raft channel above) with a clock reading attached that
+   is >= the new lease's start time. Upon application of this Raft entry, the
+   incoming leaseholder forwards its HLC to this clock reading, transitively
+   ensuring that its clock is >= the new lease's start time.
+
+   The invariant that a leaseholder's clock is always >= its lease's start time
+   is used in a few places. First, it ensures that the leaseholder's clock
+   always leads the written_timestamp of any value in its keyspace written by a
+   prior leaseholder on its range, which is an important property for the
+   correctness of observed timestamps. Second, it ensures that the leaseholder
+   immediately views itself as the leaseholder. Third, it ensures that if the
+   new leaseholder were to transfer the lease away at some point in the future,
+   this later lease's start time could be pulled from the local clock and be
+   guaranteed to receive an even greater starting timestamp.
+
+ - Range merges (Raft + BatchRequest channels). During a merge of two ranges,
+   the right-hand side of the merge passes a "frozen timestamp" clock reading
+   from the right-hand side leaseholder, through the merge transaction
+   coordinator, all the way to the left-hand side's leaseholder. This timestamp
+   is captured after the right-hand side has been subsumed and has stopped
+   serving KV traffic. When the left-hand side's leaseholder applies the range
+   merge and officially takes control of the combined range, it forwards its
+   HLC to this frozen timestamp. Like the previous interaction, this one is
+   also necessary to ensure that the leaseholder of the joint range has a clock
+   that leads the written_timestamp of any value in its keyspace, even one
+   written originally on the right-hand side range.
+
+ - Observed timestamps (Raft + BatchRequest channels). During the lifetime of a
+   transaction, its coordinator issues BatchRequests to other nodes in the
+   cluster. Each time a given transaction visits a node for the first time, it
+   captures an observation from the node's HLC. Separately, when a leaseholder
+   on a given node serves a write, it ensures that the node's HLC clock is >=
+   the written_timestamp of the write. This written_timestamp is retained even
+   if an intent is moved to a higher timestamp when it is asynchronously
+   resolved. As a result, these "observed timestamps" captured during the
+   lifetime of a transaction can be used to make a claim about values that
+   could not have been written yet at the time that the transaction first
+   visited the node, and by extension, at the time that the transaction began.
+   This allows the transaction to avoid uncertainty restarts in some
+   circumstances. For more, see pkg/kv/kvserver/observedts/doc.go.
+
+ - Non-transactional requests (Raft + BatchRequest channels). Most KV
+   operations in CockroachDB are transactional and receive their read
+   timestamps from their gateway's HLC clock when they are instantiated. They
+   use an uncertainty interval (see below) to avoid stale reads in the presence
+   of clock skew.
+
+   The KV API also exposes the option to elide the transaction for requests
+   targeting a single range (which trivially applies to all point requests).
+   These requests do not carry a predetermined read timestamp; instead, this
+   timestamp is chosen from the HLC upon arrival at the leaseholder for the
+   range. Since the HLC clock always leads the timestamp of any write served
+   on the range, this will not result in stale reads, despite not using an
+   uncertainty interval for such requests.
+
+   TODO(nvanbenschoten): this mechanism is currently broken for future-time
+   writes. We either need to give non-transactional requests uncertainty
+   intervals or remove them. See https://github.com/cockroachdb/cockroach/issues/58459.
+
+ - Transaction retry errors (BatchRequest and DistSQL channels).
+   TODO(nvanbenschoten/andreimatei): is this a real case where passing a remote
+   clock signal through the local clock is necessary? The DistSQL channel and
+   its introduction in 72fa944 seem to imply that it is, but I don't really see
+   it, because we don't use the local clock to determine which timestamp to
+   restart the transaction at. Maybe we were just concerned about a transaction
+   restarting at a timestamp above the local clock back then because we had yet
+   to separate the "clock timestamp" domain from the "transaction timestamp"
+   domain.
+
+Strict monotonicity
+
+HLCs, as implemented by CockroachDB, provide strict monotonicity within and
+across restarts on a single node. Within a continuous process, providing this
+property is trivial. Across restarts, this property is enforced by waiting out
+the maximum clock offset upon process startup before serving any requests in
+ensureClockMonotonicity.
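+
+A minimal sketch of that startup wait, under hypothetical names (the real
+logic lives in ensureClockMonotonicity and also consults a persisted clock
+upper bound when one is available):
+
+	package main
+
+	import (
+		"fmt"
+		"time"
+	)
+
+	// waitForMonotonicity sleeps until the wall clock has advanced at least
+	// maxOffset past prevHLCUpperBound, the highest timestamp the node may
+	// have handed out before it restarted. This guarantees strict
+	// monotonicity across the restart even if the system clock regressed
+	// while the process was down.
+	func waitForMonotonicity(prevHLCUpperBound time.Time, maxOffset time.Duration) {
+		if sleep := time.Until(prevHLCUpperBound.Add(maxOffset)); sleep > 0 {
+			fmt.Printf("sleeping %s to guarantee HLC monotonicity\n", sleep)
+			time.Sleep(sleep)
+		}
+	}
+
+	func main() {
+		// Pretend the node restarted 100ms ago with a 500ms max offset.
+		waitForMonotonicity(time.Now().Add(-100*time.Millisecond), 500*time.Millisecond)
+	}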
+
+The cluster setting server.clock.persist_upper_bound_interval also provides
+additional protection here by persisting the wall time of the clock
+periodically, although this protection is disabled by default.
+
+Strictly monotonic timestamp allocation ensures that two causally dependent
+transactions originating from the same node are given timestamps that reflect
+their ordering in real time. On its own, this is not a crucial guarantee, as
+CockroachDB does not and cannot assume that causally dependent operations
+originate from the same node. More importantly, strictly monotonic timestamp
+allocation underpins the causality-tracking uses detailed above.
+
+Self-stabilization
+
+HLCs provide self-stabilization in the presence of isolated transient clock
+skew fluctuations. As stated above, a node forwards its HLC upon its receipt of
+a network message. The effect of this is that given sufficient intra-cluster
+communication, HLCs across nodes tend to converge and stabilize even if their
+individual physical clocks diverge. This provides no strong guarantees but can
+mask clock synchronization errors in practice.
+
+Bounded skew
+
+HLCs within a CockroachDB deployment are configured with a maximum allowable
+offset between their physical time component and that of other HLCs in the
+cluster. This is referred to as the MaxOffset. This configuration is static
+and must be identical on all nodes in a cluster, which is enforced by the
+HeartbeatService.
+
+The notion of a maximum clock skew at all times between all HLCs in a cluster
+is a foundational assumption used in different parts of the system. What
+follows is an enumeration of the interactions that assume a bounded clock skew,
+along with a discussion for each about the consequences of that assumption
+being broken, i.e. of the maximum clock skew between two nodes in the cluster
+exceeding the configured limit.
+
+ - Transaction uncertainty intervals. The single-key linearizability property
+   is satisfied in CockroachDB by tracking an uncertainty interval for each
+   transaction, within which the real-time ordering between two transactions is
+   indeterminate. Upon its creation, a transaction is given a provisional
+   commit timestamp commit_ts from the transaction coordinator’s local HLC and
+   an uncertainty interval of [commit_ts, commit_ts + max_offset].
+
+   When a transaction encounters a value on a key at a timestamp below its
+   provisional commit timestamp, it trivially observes the value during reads
+   and overwrites the value at a higher timestamp during writes. This alone
+   would satisfy single-key linearizability if transactions had access to a
+   perfectly synchronized global clock.
+
+   Without global synchronization, the uncertainty interval is needed because
+   it is possible for a transaction to receive a provisional commit timestamp
+   up to the cluster’s max_offset earlier than a transaction that causally
+   preceded this new transaction in real time. When a transaction encounters a
+   value on a key at a timestamp above its provisional commit timestamp but
+   within its uncertainty interval, it performs an uncertainty restart, moving
+   its provisional commit timestamp above the uncertain value but keeping the
+   upper bound of its uncertainty interval fixed (the sketch following this
+   bullet makes the restart rule concrete).
+
+   This corresponds to treating all values in a transaction’s uncertainty
+   window as past writes. As a result, the operations on each key performed by
+   transactions take place in an order consistent with the real-time ordering
+   of those transactions.
+
+   HAZARD: If the maximum clock offset is exceeded, it is possible for a
+   transaction to serve a stale read that violates single-key linearizability.
+   For example, it is possible for a transaction A to write to a key and commit
+   at a timestamp t1, then for its client to hear about the commit. The client
+   may then initiate a second transaction B on a different gateway that has a
+   slow clock. If this slow clock is more than max_offset from other clocks in
+   the system, it is possible for transaction B's uncertainty interval not to
+   extend up to t1 and therefore for a read of the key that transaction A wrote
+   to be missed. Notably, this is a violation of consistency (linearizability)
+   but not of isolation (serializability) — transaction isolation has no clock
+   dependence.
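+
+   A schematic version of the uncertainty check, as a self-contained toy
+   rather than CockroachDB's actual MVCC read path:
+
+	package main
+
+	import "fmt"
+
+	type Timestamp struct {
+		WallTime int64
+		Logical  int32
+	}
+
+	func (t Timestamp) Less(s Timestamp) bool {
+		return t.WallTime < s.WallTime ||
+			(t.WallTime == s.WallTime && t.Logical < s.Logical)
+	}
+
+	// checkUncertainty classifies a committed value's timestamp against a
+	// transaction's provisional commit timestamp and the fixed upper bound
+	// of its uncertainty interval (commit_ts + max_offset at creation).
+	func checkUncertainty(valueTS, commitTS, uncertaintyLimit Timestamp) string {
+		switch {
+		case !commitTS.Less(valueTS):
+			return "visible" // at or below the provisional commit timestamp
+		case !uncertaintyLimit.Less(valueTS):
+			// Above the provisional commit timestamp but within the
+			// uncertainty interval: the value may causally precede the
+			// transaction, which must restart with a commit timestamp
+			// above valueTS while keeping uncertaintyLimit fixed.
+			return "uncertain: restart above value"
+		default:
+			return "invisible" // written after the transaction began
+		}
+	}
+
+	func main() {
+		commitTS := Timestamp{WallTime: 100}
+		limit := Timestamp{WallTime: 150} // commit_ts + max_offset
+		fmt.Println(checkUncertainty(Timestamp{WallTime: 120}, commitTS, limit))
+	}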
+
+ - Non-cooperative lease transfers. In the happy case, range leases move from
+   replica to replica using a coordinated handoff. However, in the unhappy case
+   where a leaseholder crashes or otherwise becomes unresponsive, other
+   replicas are able to attempt to acquire a new lease for the range as soon as
+   they observe the old lease expire. In this case, the max_offset plays a role
+   in ensuring that two replicas do not both consider themselves the
+   leaseholder for a range at the same (wallclock) time. This is achieved by
+   designating a "stasis" period equal in size to the max_offset at the end of
+   each lease, immediately before its expiration, as unusable. By preventing a
+   lease from being used within this stasis period, two replicas will never
+   think that they hold a valid lease at the same time, even if the outgoing
+   leaseholder has a slow clock and the incoming leaseholder has a fast clock
+   (within bounds). For more, see LeaseState_UNUSABLE.
+
+   Note however that it is easy to overstate the salient point here if one is
+   not careful. Lease start and end times operate in the MVCC time domain, and
+   any two leases are always guaranteed to cover disjoint intervals of MVCC
+   time. Leases entitle their holder to serve reads at any MVCC time below
+   their expiration and to serve writes at any MVCC time at or after their
+   start time and below their expiration. Additionally, the lease sequence is
+   attached to all writes and checked during Raft application, so a stale
+   leaseholder is unable to perform a write after it has been replaced (in
+   "consensus time"). Together, this means that even if two replicas believe
+   that they hold the lease for a range at the same time, they cannot perform
+   operations that would be incompatible with one another (e.g. two conflicting
+   writes). Again, transaction isolation has no clock dependence.
+
+   HAZARD: If the maximum clock offset is exceeded, it is possible for two
+   replicas to both consider themselves leaseholders at the same time. This
+   cannot lead to stale reads for transactional requests, because a transaction
+   with an uncertainty interval that extends past a lease's expiration will not
+   be able to use that lease to perform a read (which is enforced by a stasis
+   period immediately before its expiration). However, because some
+   non-transactional requests receive their timestamp on the server and do not
+   carry an uncertainty interval, they would be susceptible to stale reads
+   during this period. This is equivalent to the hazard for operations that do
+   use uncertainty intervals, but the mechanics differ slightly.
+
+ - "Linearizable" transactions. By default, transactions in CockroachDB provide
+   single-key linearizability and guarantee that as long as clock skew remains
+   below the configured bounds, transactions will not serve stale reads.
+   However, transactions do not provide strict serializability by default, as
+   they are susceptible to the "causal reverse" anomaly.
+
+   CockroachDB does support a stricter model of consistency through its
+   COCKROACH_EXPERIMENTAL_LINEARIZABLE environment variable. When in
+   "linearizable" mode (also known as "strict serializable" mode), all writing
+   transactions (but not read-only transactions) must wait ("commit-wait") an
+   additional max_offset after committing to ensure that their commit timestamp
+   is below the current HLC clock time of any other node in the system (see the
+   sketch after this list). In doing so, all causally dependent transactions
+   are guaranteed to start with higher timestamps, regardless of the gateway
+   they use. This ensures that all causally dependent transactions commit with
+   higher timestamps, even if their read and write sets do not conflict with
+   the original transaction's. This prevents the "causal reverse" anomaly which
+   can be observed by a third, concurrent transaction.
+
+   HAZARD: If the maximum clock offset is exceeded, it is possible that even
+   after a transaction commit-waits for the full max_offset, a causally
+   dependent transaction that evaluates on a different gateway node may receive
+   and commit with an earlier timestamp. This resuscitates the possibility of
+   the causal reverse anomaly, along with the possibility of stale reads, as
+   detailed above.
+
+   HAZARD: This mode of operation is completely untested.
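+
+The commit-wait rule itself is small. A hedged sketch, assuming a simple
+wall-clock read rather than the actual TxnCoordSender implementation:
+
+	package main
+
+	import (
+		"fmt"
+		"time"
+	)
+
+	// commitWait blocks until the local clock reads more than maxOffset
+	// past the transaction's commit timestamp. After the wait, every node's
+	// clock (within offset bounds) reads above the commit timestamp, so any
+	// causally dependent transaction starts at a higher timestamp no matter
+	// which gateway it uses.
+	func commitWait(commitTSWall int64, maxOffset time.Duration) {
+		target := commitTSWall + maxOffset.Nanoseconds()
+		if sleep := time.Duration(target - time.Now().UnixNano()); sleep > 0 {
+			time.Sleep(sleep)
+		}
+	}
+
+	func main() {
+		start := time.Now()
+		commitWait(time.Now().UnixNano(), 250*time.Millisecond)
+		fmt.Println("commit acknowledged after", time.Since(start))
+	}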
+
+To reduce the likelihood of stale reads, nodes periodically measure their
+clock’s offset from other nodes. If a node finds that its offset from a
+majority of other nodes exceeds 80% of the configured maximum offset, it
+self-terminates. This best-effort validation is done in
+(rpc.RemoteClockMonitor).VerifyClockOffset.
+*/
+package hlc
diff --git a/pkg/util/hlc/hlc.go b/pkg/util/hlc/hlc.go
index b83ade25f4dd..794ec7b753d6 100644
--- a/pkg/util/hlc/hlc.go
+++ b/pkg/util/hlc/hlc.go
@@ -8,10 +8,6 @@
 // by the Apache License, Version 2.0, included in the file
 // licenses/APL.txt.
 
-// Package hlc implements the Hybrid Logical Clock outlined in
-// "Logical Physical Clocks and Consistent Snapshots in Globally
-// Distributed Databases", available online at
-// http://www.cse.buffalo.edu/tech-reports/2014-04.pdf.
 package hlc
 
 import (
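A rough, self-contained illustration of the offset validation described in
doc.go's closing paragraph, with hypothetical names (the real check lives in
(rpc.RemoteClockMonitor).VerifyClockOffset):

	package main

	import (
		"fmt"
		"time"
	)

	// healthyMajority reports whether fewer than half of the measured peer
	// clock offsets exceed 80% of the configured maximum offset. A node
	// failing this check can no longer trust its uncertainty intervals and
	// self-terminates.
	func healthyMajority(offsets []time.Duration, maxOffset time.Duration) bool {
		tolerated := time.Duration(float64(maxOffset) * 0.8)
		bad := 0
		for _, o := range offsets {
			if o < 0 {
				o = -o
			}
			if o > tolerated {
				bad++
			}
		}
		return bad*2 < len(offsets)
	}

	func main() {
		offsets := []time.Duration{
			10 * time.Millisecond, 420 * time.Millisecond, 15 * time.Millisecond,
		}
		// 80% of 500ms is 400ms; only one peer exceeds it, so the node stays up.
		fmt.Println(healthyMajority(offsets, 500*time.Millisecond))
	}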