pkg/roachpb/api.proto

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

syntax = "proto3";
package cockroach.roachpb;
option go_package = "roachpb";

import "errorspb/errors.proto";
import "kv/kvserver/concurrency/lock/locking.proto";
import "kv/kvserver/readsummary/rspb/summary.proto";
import "roachpb/data.proto";
import "roachpb/errors.proto";
import "roachpb/metadata.proto";
import "roachpb/span_config.proto";
import "settings/encoding.proto";
import "storage/enginepb/mvcc.proto";
import "storage/enginepb/mvcc3.proto";
import "util/hlc/timestamp.proto";
import "util/tracing/tracingpb/recorded_span.proto";
import "util/tracing/tracingpb/tracing.proto";
import "gogoproto/gogo.proto";
import "google/protobuf/duration.proto";

// ReadConsistencyType specifies what type of consistency is observed
// during read operations.
enum ReadConsistencyType {
  // Generate the Go constants without the enum type name as a prefix.
  option (gogoproto.goproto_enum_prefix) = false;

  // CONSISTENT reads are guaranteed to read committed data; the
  // mechanism relies on clocks to determine lease expirations.
  // This is the zero value and therefore the default when unset.
  CONSISTENT = 0;
  // READ_UNCOMMITTED reads return both committed and uncommitted data.
  // The consistency type is similar to INCONSISTENT in that using it
  // can result in dirty reads. However, like the CONSISTENT type, it
  // requires the replica performing the read to hold a valid read lease,
  // meaning that it can't return arbitrarily stale data.
  READ_UNCOMMITTED = 1;
  // INCONSISTENT reads return the latest available, committed values.
  // They are more efficient, but may read stale values as pending
  // intents are ignored.
  INCONSISTENT = 2;
}

// RoutingPolicy specifies how a request should be routed to the
// replicas of its target range(s) by the DistSender. Policies can
// dictate which replicas are considered to be targets and in which
// order.
enum RoutingPolicy {
  // LEASEHOLDER means that the DistSender should route the request to the
  // leaseholder replica(s) of its target range(s). This is the zero value
  // and therefore the default when unset.
  LEASEHOLDER = 0;
  // NEAREST means that the DistSender should route the request to the
  // nearest replica(s) of its target range(s).
  NEAREST = 1;
}

// ResumeReason specifies why a ResumeSpan was generated instead of a
// complete result.
enum ResumeReason {
  // Generate the Go constants without the enum type name as a prefix.
  option (gogoproto.goproto_enum_prefix) = false;
  // Zero value; no resume, or an unknown reason from a future or past
  // cockroachdb version.
  RESUME_UNKNOWN = 0;
  // A key limit was exceeded, i.e. MaxSpanRequestKeys.
  RESUME_KEY_LIMIT = 1;
  // A byte limit was exceeded, i.e. TargetBytes.
  // NB: 21.2 and below will return RESUME_KEY_LIMIT instead.
  RESUME_BYTE_LIMIT = 2;
  // An intent limit was exceeded. This is currently never returned to clients,
  // since MVCCScan converts the result into a WriteIntentError.
  // NB: 21.2 and below will return RESUME_KEY_LIMIT instead.
  RESUME_INTENT_LIMIT = 3;
  // The DistSender encountered a range boundary and returned a partial result,
  // in response to return_on_range_boundary.
  RESUME_RANGE_BOUNDARY = 4;
}

// RequestHeader is supplied with every storage node request.
message RequestHeader {
  // Field numbers 1 and 2 are reserved and must not be reused.
  reserved 1, 2;
  // The key for the request. If the request operates on a range, this
  // represents the starting key for the range.
  bytes key = 3 [(gogoproto.casttype) = "Key"];
  // The end key is empty if the request spans only a single key. Otherwise,
  // it must order strictly after Key. In such a case, the header indicates
  // that the operation takes place on the key range from Key to EndKey,
  // including Key and excluding EndKey.
  bytes end_key = 4 [(gogoproto.casttype) = "Key"];
  // A zero-indexed transactional sequence number.
  int32 sequence = 5 [
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnSeq"];
}

// ResponseHeader is returned with every storage node response. Note that this is
// different from a BatchResponse's header.
message ResponseHeader {
  // txn is non-nil if the request specified a non-nil transaction.
  // The transaction timestamp and/or priority may have been updated,
  // depending on the outcome of the request.
  //
  // Once txn is merged into the BatchResponse_Header.Txn, it will be
  // reset to nil to avoid sending superfluous information over the
  // network.
  Transaction txn = 3;
  // The next span to resume from when the response doesn't cover the full span
  // requested. This can happen when a bound on the result size is set through
  // max_span_request_keys or target_bytes in the batch header or when a scan
  // has been stopped before covering the requested data because of
  // scan_options.
  //
  // ResumeSpan is unset when the entire span of keys has been
  // operated on. The span is set to the original span if the request
  // was ignored because max_span_request_keys was hit due to another
  // request in the batch. For a reverse scan the end_key is updated.
  Span resume_span = 4;
  // When resume_span is populated, this specifies the reason why the operation
  // wasn't completed and needs to be resumed.
  ResumeReason resume_reason = 7;
  // When resume_reason is RESUME_BYTE_LIMIT, this may contain the size of the
  // next result entry which caused the limit to be exceeded, i.e. the size of
  // the first entry when reading from the resume span. It is only supported by
  // Get and Scan. In a batch, this will only be set on the first response that
  // exceeds the limit.
  //
  // NB: This is best-effort, and may be 0 in some rare cases. Specifically, if
  // TargetBytes is exactly satisfied by a result that exhausted a range scan,
  // or by a response from a multi-request batch, we won't do additional work
  // (e.g. send another RPC to the next range) only to obtain resume_next_bytes.
  //
  // Also note that this is unaffected by whole_rows_of_size. The client may
  // care about whole rows, but we'll only return the size of the next KV pair
  // (which may just be part of the row), to avoid the cost of additional IO.
  int64 resume_next_bytes = 9;

  // The number of keys operated on.
  int64 num_keys = 5;
  // The number of bytes returned. Only populated for requests that support it
  // (at the time of writing, Scan, ReverseScan and ExportRequest). The number
  // returned here corresponds to the (Header).TargetBytes field and loosely
  // measures the bytes in the timestamps, keys, and values of the returned
  // rows.
  int64 num_bytes = 8;
  // Field number 6 is reserved and must not be reused.
  reserved 6;
}

// GetRequest carries the arguments for the Get() method.
message GetRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // Key-level locking mode to use for this get. With None (the default) no
  // key-level locking mode is used, i.e. the get does not acquire a lock.
  // With any other strength, a lock of that strength is acquired with the
  // Unreplicated durability (i.e. best-effort) on the key, if it exists.
  kv.kvserver.concurrency.lock.Strength key_locking = 2;
}

// GetResponse is what the Get() method returns. Value is nil when the key
// doesn't exist.
message GetResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  Value value = 2;

  // The intent seen, if any, when reading at the READ_UNCOMMITTED
  // consistency level.
  //
  // NOTE: intents for deletion tombstones are currently not reported in this
  // field. They probably should be, since the value field may hold a value
  // that a corresponding intent is deleting. Revisit this decision if it
  // ever becomes a problem.
  Value intent_value = 3;
}

// ProbeRequest is an internal request type that sends a replicated no-op
// through a Range in order to probe write availability. It is serialized
// like a regular write (it acquires latches and declares key access), but
// it does not check locks: an intent on the probed key is not observed by
// the probe. Any Replica, followers included, can serve a ProbeRequest, so
// it can be used to verify that a given Replica is able to access the
// replication layer.
message ProbeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// ProbeResponse is returned in reply to a ProbeRequest.
message ProbeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// PutRequest carries the arguments for the Put() method.
message PutRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  Value value = 2 [(gogoproto.nullable) = false];
  // When true, the value is put without a corresponding timestamp. Use this
  // option with care: it precludes the use of this value with transactions.
  bool inline = 3;
  // NOTE: For internal use only! Indicates that the put is writing to virgin
  // keyspace, so no reads are necessary to rationalize MVCC.
  bool blind = 4;
}

// PutResponse is what the Put() method returns.
message PutResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// A ConditionalPutRequest is the argument to the ConditionalPut() method.
//
// - Returns true and sets value if exp_bytes equals existing value.
// - If key doesn't exist and exp_bytes is empty, sets value.
// - Otherwise, returns a ConditionFailedError containing the actual value of the key.
//
// Note that the client is free to send more requests after a
// ConditionFailedError. This is not generally allowed after other errors
// because of fears over the ambiguity of the side-effects of failed requests
// (in particular, the timestamps at which intents might have been written).
// ConditionFailedError is a special case as we ensure there's no ambiguity; the
// error carries a WriteTimestamp that's the upper bound of the timestamps
// intents were written at.
message ConditionalPutRequest {
  RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // The value to put.
  Value value = 2 [(gogoproto.nullable) = false];
  // deprecated_exp_value represents the expected existing value for the key.
  // If the existing value is different, the request will return a
  // ConditionFailedError. A missing (Go nil) deprecated_exp_value.raw_bytes
  // means that the key is expected to not exist.
  //
  // This is deprecated in 20.2 in favor of exp_bytes, which clarifies that the
  // checksum and timestamp of the expected value are irrelevant. Remove in
  // 21.1.
  Value deprecated_exp_value = 3;
  // exp_bytes represents the expected existing value for the key. If empty, the
  // key is expected to not exist. If not empty, these bytes are expected to
  // contain the tag and data of the existing value (without the existing
  // value's checksum; the byte array is expected to come from
  // Value.TagAndDataBytes()). A value's checksum covers the key in addition to
  // covering the value, so not including a checksum here makes for an easier to
  // use API - the creator of the ConditionalPutRequest can simply put in bytes
  // coming from a different key.
  // Note that there's no such thing as expecting a key to exist, but have an
  // empty value. Such key-values don't exist.
  //
  // Note that the existing value's timestamp doesn't matter, only its data. So,
  // the CPut will succeed in ABA situations (if a reader got value A and checks
  // against it later, the check will succeed even if, in the meantime, there's
  // been a subsequent write of value B and another one back to value A).
  bytes exp_bytes = 6;
  // NOTE: For internal use only! Set to indicate that the put is
  // writing to virgin keyspace and no reads are necessary to
  // rationalize MVCC.
  bool blind = 4;
  // Typically if a specific, non-empty expected value is supplied, it *must*
  // exist with that value. Passing this indicates that it is also OK if the key
  // does not exist. This is useful when a given value is expected but it is
  // possible it has not yet been written.
  bool allow_if_does_not_exist = 5;
  // Specify as true to put the value without a corresponding
  // timestamp. This option should be used with care as it precludes
  // the use of this value with transactions.
  bool inline = 7;
}

// ConditionalPutResponse is what the ConditionalPut() method returns.
message ConditionalPutResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// InitPutRequest carries the arguments for the InitPut() method.
//
// - When the key doesn't exist, value is set.
// - When the key exists, a ConditionFailedError is returned if
//   value != existing value. With failOnTombstones set to true, tombstone
//   values count as mismatched values and cause a ConditionFailedError.
message InitPutRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  Value value = 2 [(gogoproto.nullable) = false];
  // NOTE: For internal use only! Indicates that the put is writing to virgin
  // keyspace, so no reads are necessary to rationalize MVCC.
  bool blind = 3;
  // When true, tombstones cause ConditionFailedErrors.
  bool failOnTombstones = 4;
}

// An InitPutResponse is what the InitPut() method returns.
message InitPutResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// IncrementRequest carries the arguments for the Increment() method, which
// increments the value for key and returns the new value. When no value
// exists for a key, incrementing by 0 is not a noop: it creates a zero
// value. An IncrementRequest cannot be issued against a key set by Put() or
// ConditionalPut(); likewise, Put() and ConditionalPut() cannot be invoked
// on an incremented key.
message IncrementRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  int64 increment = 2;
}

// IncrementResponse is what the Increment method returns. NewValue holds the
// value after the increment. If the value could not be decoded as
// specified, Error will be set.
message IncrementResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  int64 new_value = 2;
}

// DeleteRequest carries the arguments for the Delete() method.
message DeleteRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// DeleteResponse is what the Delete() method returns.
message DeleteResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// DeleteRangeRequest carries the arguments for the DeleteRange() method and
// names the range of keys to delete.
//
// A DeleteRangeRequest populates the timestamp cache and is tracked for
// refreshes.
message DeleteRangeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  reserved 2;
  // When set, the keys that were deleted are returned in the response.
  bool return_keys = 3;
  // When set, "inline" keys (stored without MVCC timestamps) are deleted.
  // An "inline" DeleteRange fails if it attempts to delete any keys which
  // contain timestamped (non-inline) values, so this option should only be
  // used on keys known to store inline values, such as data in cockroach's
  // time series system.
  //
  // Conversely, attempts to delete keys with inline values fail unless this
  // flag is set to true; the setting must match the data being deleted.
  //
  // Inline values cannot be deleted transactionally: a DeleteRange with
  // "inline" set to true fails if it is executed within a transaction.
  bool inline = 4;
}

// DeleteRangeResponse is what the DeleteRange() method returns.
message DeleteRangeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Every deleted key, provided return_keys was set on the request.
  repeated bytes keys = 2 [(gogoproto.casttype) = "Key"];
}

// ClearRangeRequest carries the arguments for the ClearRange() method and
// names a range of keys to clear from the underlying engine. This is unlike
// DeleteRange, which sets transactional intents and writes tombstones to
// the deleted keys. ClearRange is used when permanently dropping or
// truncating table data.
//
// ClearRange additionally moves the GC threshold for the range to the
// timestamp at which this command executes, so that reads at earlier
// timestamps cannot incorrectly return empty results.
//
// NOTE: this method must only be invoked on a key range that is guaranteed
// to be both inactive and not see future writes (until Deadline, if set,
// below). Ignoring this warning may result in data loss.
message ClearRangeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // Deadline, when set, is a time at or after which the server refuses to
  // execute this ClearRange request. It provides a form of replay
  // protection: a caller that sets it to a time in the near future can wait
  // for that time (plus max offset) to have passed, and can then reuse the
  // span it cleared without fear of this request being replayed later and
  // clearing subsequent writes.
  util.hlc.Timestamp deadline = 2 [(gogoproto.nullable) = false];
}

// ClearRangeResponse is what the ClearRange() method returns.
message ClearRangeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}


// RevertRangeRequest names a range of keys in which every MVCC revision
// more recent than some TargetTime is cleared from the underlying engine,
// which reverts the range (from the perspective of an MVCC scan) to that
// time.
message RevertRangeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // TargetTime is the time to which the range is "reverted" by clearing any
  // MVCC key with a strictly higher timestamp. It must be higher than the GC
  // Threshold for the replica, so that the keys for that time are assured
  // to still be there; otherwise the request will fail.
  util.hlc.Timestamp target_time = 2 [(gogoproto.nullable) = false];

  // NOTE(review): this field carries no documentation. Presumably it opts
  // the evaluation into a time-bound iterator optimization - confirm
  // against the code that evaluates this request.
  bool enable_time_bound_iterator_optimization = 3;

  // IgnoreGcThreshold lets a caller ignore the target-time when checking
  // that the earliest time at which the command operates is above the GC
  // threshold. Setting it is safe only in very specific situations, such as
  // when the target span was OFFLINE since the target time, as it is during
  // IMPORT INTO. There, the IMPORT knows it is the only writer and it only
  // writes new keys, so no keys to which it would need to revert have been
  // shadowed / could have been GC'ed, and the GC threshold can safely be
  // ignored.
  bool ignore_gc_threshold = 4;
}

// RevertRangeResponse is what the RevertRange() method returns.
message RevertRangeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// ScanFormat is an enumeration of the available response formats for MVCCScan
// operations.
enum ScanFormat {
  // Generate the Go constants without the enum type name as a prefix.
  option (gogoproto.goproto_enum_prefix) = false;

  // The standard MVCCScan format: a slice of KeyValue messages. This is the
  // zero value and therefore the default when unset.
  KEY_VALUES = 0;
  // The batch_response format: a byte slice of alternating keys and values,
  // each prefixed by their length as a varint.
  BATCH_RESPONSE = 1;
}


// ScanRequest carries the arguments for the Scan() method: the start and end
// keys for an ascending scan of [start,end) and the maximum number of
// results (unbounded if zero).
message ScanRequest {
  reserved 2, 3;

  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // Format wanted for the response. With BATCH_RESPONSE, the server fills in
  // the batch_responses field of the ScanResponse instead of the rows field.
  ScanFormat scan_format = 4;

  // Key-level locking mode to use during this scan. With None (the default)
  // no key-level locking mode is used, i.e. the scan does not acquire any
  // locks. With any other strength, a lock of that strength is acquired with
  // the Unreplicated durability (i.e. best-effort) on each of the keys
  // scanned by the request, subject to any key limit applied to the batch
  // which limits the number of keys returned.
  //
  // NOTE: the locks acquired with this strength are point locks on each of
  // the keys returned by the request, not a single range lock over the
  // entire span scanned by the request.
  kv.kvserver.concurrency.lock.Strength key_locking = 5;
}

// ScanResponse is what the Scan() method returns.
message ScanResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The scanned rows; empty if no rows were scanned.
  repeated KeyValue rows = 2 [(gogoproto.nullable) = false];
  // Intent rows seen while scanning at the READ_UNCOMMITTED consistency
  // level. These rows do not count against the MaxSpanRequestKeys count.
  //
  // NOTE: intents for deletion tombstones are currently not reported in this
  // field. They probably should be, since the rows field may contain
  // key-values that corresponding intents are deleting. Revisit this
  // decision if it ever becomes a problem.
  repeated KeyValue intent_rows = 3 [(gogoproto.nullable) = false];

  // When set, every item of this repeated bytes field holds part of the
  // results in batch format: the key/value pairs are a buffer of
  // varint-prefixed slices, alternating from key to value. Each entry is
  // complete, i.e. no key/value pair is split across more than one entry.
  // Across all entries there are num_keys pairs in total, as defined by the
  // ResponseHeader. If set, rows will not be set and vice versa.
  repeated bytes batch_responses = 4;
}

// ReverseScanRequest carries the arguments for the ReverseScan() method: the
// start and end keys for a descending scan of [start,end) and the maximum
// number of results (unbounded if zero).
message ReverseScanRequest {
  reserved 2, 3;

  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // Format wanted for the response. With BATCH_RESPONSE, the server fills in
  // the batch_responses field of the ReverseScanResponse instead of the rows
  // field.
  ScanFormat scan_format = 4;

  // Key-level locking mode to use during this scan. With None (the default)
  // no key-level locking mode is used, i.e. the scan does not acquire any
  // locks. With any other strength, a lock of that strength is acquired with
  // the Unreplicated durability (i.e. best-effort) on each of the keys
  // scanned by the request, subject to any key limit applied to the batch
  // which limits the number of keys returned.
  //
  // NOTE: the locks acquired with this strength are point locks on each of
  // the keys returned by the request, not a single range lock over the
  // entire span scanned by the request.
  kv.kvserver.concurrency.lock.Strength key_locking = 5;
}

// ReverseScanResponse is what the ReverseScan() method returns.
message ReverseScanResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The scanned rows; empty if no rows were scanned.
  repeated KeyValue rows = 2 [(gogoproto.nullable) = false];
  // Intent rows seen while scanning at the READ_UNCOMMITTED consistency
  // level. These rows do not count against the MaxSpanRequestKeys count.
  //
  // NOTE: intents for deletion tombstones are currently not reported in this
  // field. They probably should be, since the rows field may contain
  // key-values that corresponding intents are deleting. Revisit this
  // decision if it ever becomes a problem.
  repeated KeyValue intent_rows = 3 [(gogoproto.nullable) = false];

  // When set, every item of this repeated bytes field holds part of the
  // results in batch format: the key/value pairs are a buffer of
  // varint-prefixed slices, alternating from key to value. Each entry is
  // complete, i.e. no key/value pair is split across more than one entry.
  // Across all entries there are num_keys pairs in total, as defined by the
  // ResponseHeader. If set, rows will not be set and vice versa.
  repeated bytes batch_responses = 4;
}


// ChecksumMode selects how a consistency check is carried out; see the mode
// field of CheckConsistencyRequest.
enum ChecksumMode {
  // CHECK_VIA_QUEUE is set for requests made from the consistency queue. In
  // this mode, a full check is carried out, and depending on the result a
  // recursive consistency check is triggered:
  //
  // 1. no inconsistency found: if recomputed stats don't match persisted
  //    stats, trigger a RecomputeStatsRequest.
  // 2. inconsistency found: if a diff is available, print it and trigger
  //    fatal error. If no diff found, trigger recursive check with diff
  //    requested (which then triggers fatal error).
  //
  // This is the zero value and therefore the default when unset.
  //
  // TODO(tbg): these semantics are an artifact of how consistency checks were
  // first implemented. The extra behavior here should move to the consistency
  // check queue instead and this option dropped from the enum.
  CHECK_VIA_QUEUE = 0;
  // CHECK_FULL recomputes the hash of the replicated data in all replicas and
  // uses this to determine whether there is an inconsistency.
  CHECK_FULL = 1;
  // CHECK_STATS hashes only the persisted lease applied state (which notably
  // includes the persisted MVCCStats). This catches a large class of
  // replica inconsistencies observed in the wild (where replicas apply a
  // nonidentical log of commands, and as a result almost always have
  // divergent stats), while doing work independent of the size of the data
  // contained in the replicas.
  CHECK_STATS = 2;
}

// CheckConsistencyRequest carries the arguments for the CheckConsistency()
// method: the start and end keys for a span of ranges to which a consistency
// check should be applied. Checking a range for consistency involves running
// a ComputeChecksum on the range followed by a storage.CollectChecksum.
message CheckConsistencyRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Log a diff of inconsistencies if such inconsistencies are found. This is
  // only valid if mode == CHECK_VIA_QUEUE.
  bool with_diff = 2;
  // The kind of check to perform; see ChecksumMode.
  ChecksumMode mode = 3;
  // Whether to create a RocksDB checkpoint on each replica at the log
  // position at which the SHA is computed. The checkpoint is essentially a
  // cheap point-in-time backup of the database. It is put into the engines'
  // auxiliary directory and needs to be removed manually to avoid leaking
  // disk space.
  bool checkpoint = 4;
  // The nodes that the consistency check wants to terminate. This is
  // typically set together with Checkpoint above, as part of a second round
  // that follows a first consistency check which did find a divergence. The
  // second round is concerned with damage control and wants the nodes it
  // suspects hold anomalous data to be shut down, so that this data isn't
  // served to clients (or worse, spread to other replicas).
  repeated ReplicaDescriptor terminate = 5 [(gogoproto.nullable) = false];
}

// CheckConsistencyResponse is what the CheckConsistency() method returns. It
// reports the status the range was found in.
message CheckConsistencyResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // Status is the outcome of the consistency check for a single range.
  enum Status {
    // No inconsistency was detected, but not every replica returned a
    // checksum.
    RANGE_INDETERMINATE = 0;
    // A definite inconsistency was detected.
    RANGE_INCONSISTENT = 1;
    // All replicas of the range agreed on the checksum.
    RANGE_CONSISTENT = 2;
    // Like RANGE_CONSISTENT, except that the recomputed stats disagreed with
    // the persisted stats. The persisted stats indicate estimates, so this
    // is expected.
    RANGE_CONSISTENT_STATS_ESTIMATED = 3;
    // Like RANGE_CONSISTENT_STATS_ESTIMATED, except that the mismatch
    // occurred with persisted stats that claimed to be accurate. This is
    // unexpected and likely indicates a bug in our logic to incrementally
    // update the stats as commands are evaluated and applied.
    RANGE_CONSISTENT_STATS_INCORRECT = 4;
  }

  // Result is the per-range outcome of the check.
  message Result {
    int64 range_id = 1 [
      (gogoproto.customname) = "RangeID",
      (gogoproto.casttype) = "RangeID"
    ];
    // start_key of the range corresponding to range_id (at the time of the
    // check). Because requests can only be routed by key, this is useful for
    // later sending additional requests to only a subset of the ranges
    // contained within a result.
    bytes start_key = 2;
    Status status = 3;
    // detail holds information related to the operation. When no
    // inconsistency is found, it contains informational value such as
    // observed stats. When an inconsistency is found, it contains
    // information about that inconsistency including the involved replica
    // and, if requested, the diff.
    string detail = 4;
  }

  // result holds one Result per Range checked, in no particular order.
  repeated Result result = 2 [(gogoproto.nullable) = false];
}

// A RecomputeStatsRequest triggers a stats recomputation on the Range that
// the request addresses.
//
// An error is returned if the start key does not match the start key of the
// target Range.
//
// Although the stats recomputation touches essentially the whole range, the
// command avoids having to block other commands by taking care to not
// interleave with splits, and by using the commutativity of stats updates.
// It is therefore safe to invoke at any time, including repeatedly, though
// it should be used conservatively because it performs a full scan of the
// Range.
message RecomputeStatsRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // With dry_run set to true, the stats delta is computed but no stats
  // adjustment is performed. This isn't useful outside of testing since
  // RecomputeStats is safe and idempotent.
  bool dry_run = 2;
}

// A RecomputeStatsResponse is returned in reply to a RecomputeStatsRequest.
message RecomputeStatsResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // added_delta is the adjustment made to the range's stats, i.e.
  // `new_stats = old_stats + added_delta`.
  storage.enginepb.MVCCStatsDelta added_delta = 2 [(gogoproto.nullable) = false];
}

// An EndTxnRequest is the argument to the EndTxn() method. It specifies
// whether an extant transaction should be committed or rolled back.
message EndTxnRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // True to commit the transaction; false to abort and roll it back.
  bool commit = 2;
  // If set, deadline represents the maximum (exclusive) timestamp at which the
  // transaction can commit (i.e. the maximum timestamp for the txn's reads and
  // writes).
  // If EndTxn(Commit=true) finds that the txn's timestamp has been pushed above
  // this deadline, an error will be returned and the client is supposed to
  // roll back the txn.
  util.hlc.Timestamp deadline = 3;
  // Commit triggers. Note that commit triggers are for internal use only and
  // will cause an error if requested through the external-facing KV API.
  InternalCommitTrigger internal_commit_trigger = 4;
  // Set of spans that the transaction has acquired locks within. These are
  // spans which must be resolved on txn completion. Note that these spans
  // may be condensed to cover aggregate spans if the keys locked by the
  // transaction exceeded a size threshold.
  //
  // The set logically extends to include the keys of all writes in the
  // in-flight write set. However, those keys are not stored in this set
  // to avoid duplication. This means that elements that are removed from
  // that set should be merged into this one.
  //
  // The slice is maintained in sorted order and all spans are maximally
  // merged such that no two spans here overlap each other.
  repeated Span lock_spans = 5 [(gogoproto.nullable) = false];
  // Set of in-flight intent writes that have been issued by the transaction but
  // which may not have succeeded yet. If any in-flight writes are provided, a
  // committing EndTxn request will move a PENDING transaction to the STAGING
  // status instead of the COMMITTED status. These in-flight writes must then
  // all be confirmed as successful before the transaction can be moved from
  // STAGING to COMMITTED. For more, see txnCommitter.
  //
  // The slice is maintained in sorted order by sequence number. This provides
  // O(log n) access to individual writes in this set based on their sequence
  // number. See SequencedWriteBySeq.Find and its uses. The set can contain
  // multiple SequencedWrites with the same key, but all sequence numbers are
  // unique.
  repeated SequencedWrite in_flight_writes = 17 [(gogoproto.nullable) = false];
  // Requires that the transaction completes as a 1 phase commit. This
  // guarantees that all writes are to the same range and that no
  // intents are left in the event of an error.
  //
  // Note(andrei): Use this flag with care; retriable errors are not generated
  // reliably for these transactions - a TransactionStatusError might be
  // returned instead if 1PC execution fails.
  bool require_1pc = 6 [(gogoproto.customname) = "Require1PC"];
  // True to indicate that lock spans should be resolved with poison=true.
  // This is used when the transaction is being aborted independently of the
  // main thread of client operation, as in the case of an asynchronous abort
  // from the TxnCoordSender on a failed heartbeat. It should only be set to
  // true when commit=false.
  bool poison = 9;
  // Field numbers of removed fields; they must not be reused.
  reserved 7, 8, 10;
}

// An EndTxnResponse is the return value from the EndTxn() method. The final
// transaction record is returned as part of the response header. In particular,
// transaction status and timestamp will be updated to reflect final committed
// values. Clients may propagate the transaction timestamp as the final txn
// commit timestamp in order to preserve causal ordering between subsequent
// transactions.
message EndTxnResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Field numbers of removed fields; they must not be reused.
  reserved 2, 3;
  // True if the transaction committed on the one phase commit path.
  // This means that all writes which were part of the transaction
  // were written as a single, atomic write batch to just one range.
  bool one_phase_commit = 4;
  // The commit timestamp of the STAGING transaction record written
  // by the request. Only set if the transaction record was staged.
  util.hlc.Timestamp staging_timestamp = 5 [(gogoproto.nullable) = false];
}

// An AdminSplitRequest is the argument to the AdminSplit() method. The
// existing range which contains header.key is split by
// split_key. If split_key is not specified, then this method will
// determine a split key that is roughly halfway through the
// range. The existing range is resized to cover only its start key to
// the split key. The new range created by the split starts at the
// split key and extends to the original range's end key. If split_key
// is known, header.key should also be set to split_key.
//
// New range IDs for each of the split range's replicas and a new Raft
// ID are generated by the operation. Split requests are done in the
// context of a distributed transaction which updates range addressing
// records, range metadata and finally, provides a commit trigger to
// update bookkeeping and instantiate the new range on commit.
//
// The new range contains range replicas located on the same stores;
// no range data is moved during this operation. The split can be
// thought of as a mostly logical operation, though some other
// metadata (e.g. abort span and range stats) must be copied or
// recomputed.
//
// expiration_time represents the time that this split expires. Any split that
// is not expired will not be considered for automatic merging by the merge
// queue. Any split requested by the split queue will have an expiration time
// of hlc.Timestamp{} (i.e. the zero timestamp, so they are always eligible for
// automatic merging).
message AdminSplitRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The key at which to split. If unset, a split key roughly halfway through
  // the range is determined by the method (see the message comment).
  bytes split_key = 2 [(gogoproto.casttype) = "Key"];
  // Field number of a removed field; it must not be reused.
  reserved 3;
  // The time at which this split expires (see the message comment).
  util.hlc.Timestamp expiration_time = 4 [(gogoproto.nullable) = false];

  // PredicateKeys specifies keys which if not contained within the range should
  // cause the split to be rejected. This can be used by a caller to effectively
  // send a "conditional split" request, i.e. a split if not already split.
  repeated bytes predicate_keys = 5 [(gogoproto.casttype) = "Key"];
}

// An AdminSplitResponse is the return value from the AdminSplit() method.
message AdminSplitResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// An AdminUnsplitRequest is the argument to the AdminUnsplit() method. It
// removes the sticky bit of the existing range whose starting key is
// header.key.
//
// Ranges that do not have the sticky bit set are eligible for automatic
// merging.
message AdminUnsplitRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// An AdminUnsplitResponse is the return value from the AdminUnsplit() method.
message AdminUnsplitResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// An AdminMergeRequest is the argument to the AdminMerge() method. A
// merge is performed by calling AdminMerge on the left-hand range of
// two consecutive ranges (i.e. the range which contains keys which
// sort first). This range will be the subsuming range and the
// right-hand range will be subsumed. After the merge operation, the
// subsumed range will no longer exist and the subsuming range will
// now encompass all keys from its original start key to the end key
// of the subsumed range. If AdminMerge is called on the final range
// in the key space, it is a noop.
//
// The request must be addressed to the start key of the left-hand side.
message AdminMergeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// An AdminMergeResponse is the return value from the AdminMerge() method.
message AdminMergeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// An AdminTransferLeaseRequest is the argument to the AdminTransferLease()
// method. A lease transfer allows an external entity to control the lease
// holder for a range. The target of the lease transfer needs to be a valid
// replica of the range.
message AdminTransferLeaseRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The ID of the store that the lease should be transferred to.
  int32 target = 2 [(gogoproto.casttype) = "StoreID"];
}

// An AdminTransferLeaseResponse is the return value from the
// AdminTransferLease() method.
message AdminTransferLeaseResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// A ReplicationChange specifies the type and target of a replication change
// operation.
message ReplicationChange {
  // The type of replication change to perform.
  ReplicaChangeType change_type = 1;
  // The target that the change applies to.
  ReplicationTarget target = 2 [(gogoproto.nullable) = false];
}

// An AdminChangeReplicasRequest is the argument to the AdminChangeReplicas()
// method. A change replicas operation allows adding or removing a set of
// replicas for a range.
message AdminChangeReplicasRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Never access directly: use .Changes()
  //
  // TODO(tbg): remove in 20.1
  ReplicaChangeType deprecated_change_type = 2;
  // Never access directly: use .Changes()
  //
  // TODO(tbg): remove in 20.1
  repeated ReplicationTarget deprecated_targets = 3 [
    (gogoproto.nullable) = false
  ];
  // ExpDesc is the expected current range descriptor to modify. If the range
  // descriptor is not identical to ExpDesc, the request will fail.
  //
  // If there is more than one change specified in targets, this expectation
  // will be applied to the first change and subsequent changes will use the
  // resultant descriptor from successfully applying the previous change.
  // If a change with more than one target occurs concurrently with another
  // it is possible that an error will occur after partial application of the
  // change. Changes are applied in the order they appear in the request.
  RangeDescriptor exp_desc = 4 [(gogoproto.nullable) = false];

  // The changes to apply to exp_desc. Never access directly: use .Changes().
  //
  // TODO(tbg): rename to 'changes' in 20.1 and remove Changes().
  repeated ReplicationChange internal_changes = 5 [
    (gogoproto.nullable) = false
  ];
}

// An AdminChangeReplicasResponse is the return value from the
// AdminChangeReplicas() method.
message AdminChangeReplicasResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Desc is the value of the range descriptor upon success.
  RangeDescriptor desc = 2 [(gogoproto.nullable) = false];
}

// An AdminRelocateRangeRequest is the argument to the AdminRelocateRange()
// method. Relocates the replicas for a range to the specified target stores.
// The first store in the list of targets becomes the new leaseholder.
message AdminRelocateRangeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The targets that should hold the range's voting replicas.
  repeated ReplicationTarget voter_targets = 2 [
    (gogoproto.nullable) = false
  ];
  // The targets that should hold the range's non-voting replicas.
  repeated ReplicationTarget non_voter_targets = 3 [
    (gogoproto.nullable) = false
  ];
  // TODO(a-robinson): Add "reason"/"details" string fields?

  // As of 22.1 (specifically #74077), leaseholder replicas can remove
  // themselves from the range. This means that now, in a joint state, the
  // leaseholder that is removing itself chooses the best target replica to
  // transfer the lease to, all inside of AdminChangeReplicas.
  //
  // This means that the pre-22.1 contract of `AdminRelocateRange` to transfer
  // the lease to the first voter replica isn't required anymore. Only callers
  // that rely on this contract should set this attribute.
  bool transfer_lease_to_first_voter = 4;
  // TODO(aayush): Migration path:
  // 22.1: Send and consult the attribute.
  // 22.2: Send but don't consult the attribute.
  // 23.1: Stop sending or consulting the attribute. Remove this field.
  bool transfer_lease_to_first_voter_accurate = 5;
}

// An AdminRelocateRangeResponse is the return value from the
// AdminRelocateRange() method.
message AdminRelocateRangeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// A HeartbeatTxnRequest is the argument to the HeartbeatTxn() method. It is
// sent by transaction coordinators to let the system know that the
// transaction is still ongoing. Note that this heartbeat message is different
// from the heartbeat message in the gossip protocol.
message HeartbeatTxnRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // NOTE: this could use a ClockTimestamp type, but doing so results in a
  // large diff that doesn't seem worth it, given that we never feed this
  // timestamp back into a clock.
  util.hlc.Timestamp now = 2 [(gogoproto.nullable) = false];
}

// A HeartbeatTxnResponse is the return value from the HeartbeatTxn()
// method. It returns the transaction info in the response header. The
// returned transaction lets the coordinator know the disposition of
// the transaction (i.e. aborted, committed, or pending).
message HeartbeatTxnResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// A GCRequest is the argument to the GC() method. It is sent by range
// lease holders after scanning range data to find expired MVCC values.
message GCRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // GCKey pairs a key with a timestamp.
  // NOTE(review): presumably versions of the key at or below the timestamp
  // are the ones to be collected - confirm against the GC implementation.
  message GCKey {
    bytes key = 1 [(gogoproto.casttype) = "Key"];
    util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
  }
  // The keys to garbage collect.
  repeated GCKey keys = 3 [(gogoproto.nullable) = false];
  // Threshold is the expiration timestamp.
  util.hlc.Timestamp threshold = 4 [(gogoproto.nullable) = false];

  // Field number of a removed field; it must not be reused.
  reserved 5;
}

// A GCResponse is the return value from the GC() method.
message GCResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// PushTxnType determines what action to take when pushing a transaction.
enum PushTxnType {
  // Generate the Go constants without the enum type name as a prefix.
  option (gogoproto.goproto_enum_prefix) = false;

  // Push the timestamp forward if possible to accommodate a concurrent reader.
  PUSH_TIMESTAMP = 0;
  // Abort the transaction if possible to accommodate a concurrent writer.
  PUSH_ABORT = 1;
  // Abort the transaction if it's abandoned, but don't attempt to mutate it
  // otherwise.
  PUSH_TOUCH = 2;

  // Value 3 is reserved so that it cannot be reused by a new push type.
  reserved 3;
}

// A PushTxnRequest is the argument to the PushTxn() method. It is sent by
// readers or writers which have encountered an "intent" laid down by
// another transaction. The goal is to resolve the conflict. Note that
// args.Key should be set to the txn ID of args.PusheeTxn, not
// args.PusherTxn. This RPC is addressed to the range which owns the pushee's
// txn record.
//
// Resolution is trivial if the txn which owns the intent has either
// been committed or aborted already. Otherwise, the existing txn can
// either be aborted (for write/write conflicts), or its commit
// timestamp can be moved forward (for read/write conflicts). The
// course of action is determined by the specified push type, and by
// the owning txn's status and priority.
message PushTxnRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Transaction which encountered the intent, if applicable. For a
  // non-transactional pusher, pusher_txn will only have the priority set (in
  // particular, ID won't be set). Used to compare priorities and timestamps if
  // priorities are equal.
  Transaction pusher_txn = 2 [(gogoproto.nullable) = false];
  // Transaction to be pushed, as specified at the intent which led to
  // the push transaction request. Note that this may not be the most
  // up-to-date value of the transaction record, but will be set or
  // merged as appropriate.
  storage.enginepb.TxnMeta pushee_txn = 3 [(gogoproto.nullable) = false];
  // PushTo is the timestamp which PusheeTxn should be pushed to. During
  // conflict resolution, it should be set just after the timestamp of the
  // conflicting read or write.
  util.hlc.Timestamp push_to = 4 [(gogoproto.nullable) = false];
  // Readers set this to PUSH_TIMESTAMP to move pushee_txn's provisional
  // commit timestamp forward. Writers set this to PUSH_ABORT to request
  // that pushee_txn be aborted if possible. Inconsistent readers set
  // this to PUSH_TOUCH to determine whether the pushee can be aborted
  // due to inactivity (based on the now field).
  PushTxnType push_type = 6;
  // Forces the push by overriding the normal expiration and priority checks
  // in PushTxn to either abort or push the timestamp.
  bool force = 7;

  // Field numbers of removed fields; they must not be reused.
  reserved 5, 8, 9;
}

// A PushTxnResponse is the return value from the PushTxn() method. It
// returns success and the resulting state of PusheeTxn if the
// conflict was resolved in favor of the caller; the caller should
// subsequently invoke ResolveIntent() on the conflicted key. It
// returns an error otherwise.
message PushTxnResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // pushee_txn is populated if the transaction was pushed and contains
  // the current value of the transaction.
  // TODO(tschottdorf): Maybe this can be a TxnMeta instead; probably requires
  // factoring out the new Priority.
  Transaction pushee_txn = 2 [(gogoproto.nullable) = false];
}

// A RecoverTxnRequest is the argument to the RecoverTxn() method. It is sent
// during the recovery process for a transaction abandoned in the STAGING state.
// The sender is expected to have queried all of the abandoned transaction's
// in-flight writes and determined whether they all succeeded or not. This is
// used to determine whether the result of the recovery should be committing the
// abandoned transaction or aborting it.
message RecoverTxnRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Transaction record to recover.
  storage.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false];
  // Did all of the STAGING transaction's writes succeed? If so, the transaction
  // is implicitly committed and the commit can be made explicit by giving its
  // record a COMMITTED status. If not, the transaction can be aborted as long
  // as a write that was found to have failed was prevented from ever succeeding
  // in the future.
  bool implicitly_committed = 3;
}

// A RecoverTxnResponse is the return value from the RecoverTxn() method.
message RecoverTxnResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // Contains the finalized state of the recovered transaction.
  Transaction recovered_txn = 2 [(gogoproto.nullable) = false];
}

// A QueryTxnRequest is the argument to the QueryTxn() method. It is sent
// by transactions which are waiting to push another transaction because
// of conflicting write intents to fetch updates to either the pusher's
// or the pushee's transaction records.
message QueryTxnRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Transaction record to query.
  storage.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false];
  // If true, the query will not return until there are changes to either the
  // transaction status or priority -OR- to the set of dependent transactions.
  bool wait_for_update = 3;
  // Set of known dependent transactions.
  repeated bytes known_waiting_txns = 4 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"
  ];
}

// A QueryTxnResponse is the return value from the QueryTxn() method.
message QueryTxnResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Contains the current state of the queried transaction. If the queried
  // transaction record does not exist, this will be empty.
  Transaction queried_txn = 2 [(gogoproto.nullable) = false];
  // txn_record_exists is set if the queried_txn comes from a transaction record
  // read from the database. If not set, then the txn record was "synthesized".
  //
  // The field only started being populated in 20.2, so 20.1 nodes never set it.
  bool txn_record_exists = 4;
  // Specifies a list of transaction IDs which are waiting on the txn.
  repeated bytes waiting_txns = 3 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"
  ];
}

// A QueryIntentRequest is the argument to the QueryIntent() method. It visits
// the specified key and checks whether an intent is present for the given
// transaction. If the intent is found to be missing then it is prevented
// from ever being written in the future.
message QueryIntentRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // The TxnMeta that the intent is expected to have. Specifically, whether an
  // intent is a match or not is defined as whether an intent exists that could
  // be committed by the provided transaction. If an intent is found at the
  // specified key, the intent is only considered a match if it has the same ID,
  // the same epoch, and a write timestamp that is equal to or less than that in
  // the provided transaction.
  //
  // Additionally, the intent is only considered a match if its sequence number
  // is equal to or greater than the expected txn's sequence number. The
  // request doesn't require an exact sequence number match because the
  // transaction could have performed overlapping writes, in which case only the
  // latest sequence number will remain. We assume that if a transaction has
  // successfully written an intent at a larger sequence number then it must
  // have succeeded in writing an intent at the smaller sequence number as
  // well.
  //
  // QueryIntentRequests may be issued in non-transactional BatchRequests or in
  // transactional BatchRequests. If issued inside of a transaction, the TxnMeta
  // must be a reference to the same transaction as the batch's transaction, or
  // the request will be rejected. In other words, a transaction can only query
  // its own intents. In these cases where the BatchRequest is transactional,
  // the TxnMeta's write timestamp is forwarded by the write timestamp of the
  // request header transaction for purposes of determining whether a matching
  // intent is found or not (see condition above). This is useful to avoid the
  // need to update each QueryIntentRequest when a transaction is querying its
  // own intent after having successfully refreshed.
  storage.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false];

  // If true, return an IntentMissingError if a matching intent is not found.
  // Special-cased to return a SERIALIZABLE retry error if a SERIALIZABLE
  // transaction queries its own intent and finds it has been pushed.
  bool error_if_missing = 3;
}

// A QueryIntentResponse is the return value from the QueryIntent() method.
message QueryIntentResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Whether an intent matching the expected transaction was found at the key.
  bool found_intent = 2;
}

// A ResolveIntentRequest is the argument to the ResolveIntent() method. It is
// sent by transaction coordinators after successfully calling PushTxn to
// clean up write intents: either to remove, commit or move them forward in
// time.
message ResolveIntentRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The transaction whose intent is being resolved.
  storage.enginepb.TxnMeta intent_txn = 2 [(gogoproto.nullable) = false];
  // The status of the transaction.
  TransactionStatus status = 3;
  // Optionally poison the abort span for the transaction on the intent's range.
  // The field is ignored if status != ABORTED (i.e. only intents from ABORTED
  // txns ever poison the abort spans).
  bool poison = 4;
  // The list of ignored seqnum ranges as per the Transaction record.
  repeated storage.enginepb.IgnoredSeqNumRange ignored_seqnums = 5 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "IgnoredSeqNums"
  ];
  // An optional clock observation from the intent's leaseholder node that was
  // captured at some point before the intent's transaction was pushed and found
  // to be PENDING. The clock observation is used to forward the intent's local
  // timestamp during intent resolution. If the clock observation was captured
  // from a different node than the node which evaluates the ResolveIntent
  // request, it will be ignored and the intent's local timestamp will not be
  // changed.
  ObservedTimestamp clock_while_pending = 6 [(gogoproto.nullable) = false];
}

// A ResolveIntentResponse is the return value from the ResolveIntent() method.
message ResolveIntentResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// A ResolveIntentRangeRequest is the argument to the ResolveIntentRange()
// method. It is sent by transaction coordinators after successfully calling
// PushTxn to clean up write intents: either to remove, commit or move them
// forward in time.
message ResolveIntentRangeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The transaction whose intents are being resolved.
  storage.enginepb.TxnMeta intent_txn = 2 [(gogoproto.nullable) = false];
  // The status of the transaction.
  TransactionStatus status = 3;
  // Optionally poison the abort span for the transaction on all ranges on which
  // the intents reside. The field is ignored if status != ABORTED (i.e. only
  // intents from ABORTED txns ever poison the abort spans).
  bool poison = 4;
  // The minimum timestamp for any intents written by this
  // transaction. If present, this value can be used to optimize the
  // iteration over the span to find intents to resolve.
  util.hlc.Timestamp min_timestamp = 5 [(gogoproto.nullable) = false];
  // The list of ignored seqnum ranges as per the Transaction record.
  repeated storage.enginepb.IgnoredSeqNumRange ignored_seqnums = 6 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "IgnoredSeqNums"
  ];
  // An optional clock observation from the intent's leaseholder node that was
  // captured at some point before the intent's transaction was pushed and found
  // to be PENDING. The clock observation is used to forward the intent's local
  // timestamp during intent resolution. If the clock observation was captured
  // from a different node than the node which evaluates the ResolveIntentRange
  // request, it will be ignored and the intent's local timestamp will not be
  // changed.
  ObservedTimestamp clock_while_pending = 7 [(gogoproto.nullable) = false];
}

// A ResolveIntentRangeResponse is the return value from the
// ResolveIntentRange() method.
message ResolveIntentRangeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// A MergeRequest contains the arguments to the Merge() method. It specifies
// a key and a value which should be merged into the existing value at that
// key.
message MergeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The value to merge into the existing value at the request's key.
  Value value = 2 [(gogoproto.nullable) = false];
}

// A MergeResponse is the response to a Merge() operation.
message MergeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// TruncateLogRequest is used to remove a prefix of the raft log. While there
// is no requirement for correctness that the raft log truncation be
// synchronized across replicas, it is nice to preserve the property that all
// replicas of a range are as close to identical as possible. The raft leader
// can also inform decisions about the cutoff point with its knowledge of the
// replicas' acknowledgment status.
message TruncateLogRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // Log entries < this index are to be discarded.
  uint64 index = 2;

  // RangeID is used to double check that the correct range is being truncated.
  // The header specifies a span, start and end keys, but not the range id
  // itself. The range may have changed from the one specified in the header
  // in the case of a merge.
  int64 range_id = 3 [
    (gogoproto.customname) = "RangeID",
    (gogoproto.casttype) = "RangeID"
  ];

  // ExpectedFirstIndex is the expected Index of the last TruncateLogRequest,
  // i.e., we expect that this request will typically be truncating entries
  // [ExpectedFirstIndex, Index).
  //
  // There is no correctness issue if the replica applying this truncation has
  // not seen the preceding TruncateLogRequest or has seen one with an Index
  // that is not equal to ExpectedFirstIndex. This is an optimization that
  // typically allows the potentially expensive computation of the bytes being
  // discarded from the raft log to be performed once, at the leaseholder.
  //
  // Populated starting at cluster version LooselyCoupledRaftLogTruncation.
  uint64 expected_first_index = 4;
}

// A TruncateLogResponse is the response to a TruncateLog() operation.
message TruncateLogResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// A RequestLeaseRequest is the argument to the RequestLease() method. It is
// sent by the store on behalf of one of its ranges upon receipt of a command
// requiring a lease when none is found.
message RequestLeaseRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The lease being requested.
  Lease lease = 2 [(gogoproto.nullable) = false];
  // The previous lease is specified by the caller to verify
  // it has not changed when executing this command.
  Lease prev_lease = 3 [(gogoproto.nullable) = false];
  // The MinLeaseProposedTS of the proposing replica to make sure that leases
  // issued after a node restart receive a new sequence number (instead of
  // counting as a lease extension). See #23204.
  util.hlc.Timestamp min_proposed_ts = 4 [
    (gogoproto.customname) = "MinProposedTS",
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/util/hlc.ClockTimestamp"
  ];
}

// A TransferLeaseRequest represents the arguments to the TransferLease()
// method. It is sent by a replica that currently holds the range lease and
// wants to transfer it away.
//
// Like a RequestLeaseRequest, this request has the effect of instituting a new
// lease. The difference is that the new lease is allowed to overlap the
// existing one. It is a separate request because the RequestLeaseRequest is
// special - it's not subject to the same replay protection restrictions as
// other requests, instead being protected from replays by the fact that leases
// are not generally allowed to overlap. The TransferLeaseRequest is not
// special in this respect (for example, the proposer of this command is
// checked to have been holding the lease when the proposal was made).
message TransferLeaseRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // The new lease to institute.
  Lease lease = 2 [(gogoproto.nullable) = false];
  // The previous lease is specified by the caller to verify
  // it has not changed when executing this command.
  Lease prev_lease = 3 [(gogoproto.nullable) = false];
}

// LeaseInfoRequest is the argument to the LeaseInfo() method, for getting
// information about a range's lease.
//
// It's a point request, so it addresses one single range, and returns the lease
// currently in effect for that range. This request is commonly sent with
// ReadConsistency=INCONSISTENT in order for the request to be served by the
// node to whom the request was sent.
message LeaseInfoRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// LeaseInfoResponse is the response to a LeaseInfo() operation.
message LeaseInfoResponse{
  // The response header, embedded into the generated struct.
  ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // The last lease known by the replica serving the request. It can also be the
  // tentative future lease, if a lease transfer is in progress.
  Lease lease = 2 [(gogoproto.nullable) = false];
  // current_lease is set if `lease` represents a tentative future lease. In
  // that case, current_lease represents the lease that's currently in effect.
  Lease current_lease = 3;
  // evaluated_by returns the store that evaluated this request. This
  // corresponds to the leaseholder unless ReadConsistency=INCONSISTENT was
  // used. The response reflects the evaluator's view of the lease. When the
  // client cares to see a particular node's view, it can use this field to
  // check whether the node it intended to query (by sending the request to
  // that node and using ReadConsistency=INCONSISTENT) indeed served it - it's
  // possible that even if ReadConsistency=INCONSISTENT was used, the request is
  // still not evaluated by the node it was sent to if that node's replica is a
  // learner or the node doesn't have a replica at all.
  int32 evaluated_by = 4 [(gogoproto.casttype) = "StoreID"];
}

// A RequestLeaseResponse is returned in response to both RequestLease() and
// TransferLease() operations. It carries nothing beyond the response header.
message RequestLeaseResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// A ComputeChecksumRequest is the argument to the ComputeChecksum() method, to
// start computing the checksum for the specified range at the snapshot for this
// request command. A response is returned without the checksum. The computed
// checksum is retrieved via a storage.CollectChecksumRequest.
message ComputeChecksumRequest {
  // The request header, embedded into the generated struct.
  RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // The version used to pick the checksum method. It allows us to use a
  // consistent checksumming method across replicas.
  uint32 version = 2;
  // Field number 3 is reserved and must not be reused.
  reserved 3;
  // Compute a checksum along with a snapshot of the entire range, that will be
  // used in logging a diff during checksum verification.
  bool snapshot = 4;
  // The type of checksum to compute. See ChecksumMode.
  ChecksumMode mode = 5;
  // If set, a checkpoint (i.e. cheap backup) of the engine will be taken. This
  // is expected to be set only if we already know that there is a problem and
  // we want to preserve as much state as possible. The checkpoint will be stored
  // in the engine's auxiliary directory.
  bool checkpoint = 6;
  // If non-empty, specifies the replicas which are the most likely source of the
  // inconsistency. After evaluating the command, these replicas will terminate.
  //
  // See the field of the same name in CheckConsistencyRequest for details.
  repeated ReplicaDescriptor terminate = 7 [(gogoproto.nullable) = false];
}

// A ComputeChecksumResponse is returned in response to a ComputeChecksum()
// operation. It does not contain the checksum itself, only the ID under which
// the checksum can be collected later.
message ComputeChecksumResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // ChecksumID uniquely identifies the computation; pass it in a future
  // storage.CollectChecksumRequest to obtain the computed checksum.
  bytes checksum_id = 2 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "ChecksumID",
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"
  ];
}

// ExternalStorageProvider identifies the kind of external storage described
// by an ExternalStorage message (see ExternalStorage.provider).
//
// NOTE(review): the value names presumably correspond to the URI schemes
// accepted for external storage locations - confirm before relying on it.
enum ExternalStorageProvider {
  // Unknown is the zero value, i.e. what an unset provider decodes to.
  Unknown = 0;
  nodelocal = 1;
  http = 2;
  s3 = 3;
  gs = 4;
  azure = 5;
  // Value 6 is reserved and must not be reused.
  reserved 6;
  userfile = 7;
  null = 8;
}

// ExternalStorage describes an external storage location. The provider field
// names the kind of storage and the remaining fields carry per-provider
// configuration.
//
// NOTE(review): presumably only the configuration field matching `provider`
// is consulted - confirm against the consumers of this message.
message ExternalStorage {
  // provider selects the kind of external storage being described.
  ExternalStorageProvider provider = 1;

  // LocalFilePath is the type of the LocalFile field: a path together with a
  // node ID.
  message LocalFilePath {
    string path = 1;
    uint32 node_id = 2 [(gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  }
  // Http is the type of the HttpPath field; it carries a single base URI.
  message Http {
    string baseUri = 1;
  }
  // S3 is the type of the S3Config field.
  message S3 {
    string bucket = 1;
    string prefix = 2;

    string access_key = 3;
    string secret = 4;
    string temp_token = 5;
    string endpoint = 6;
    string region = 7;
    string auth = 8;
    string server_enc_mode  = 9;
    string server_kms_id = 10  [(gogoproto.customname) = "ServerKMSID"];
    string storage_class = 11;
  }
  // GCS is the type of the GoogleCloudConfig field.
  message GCS {
    string bucket = 1;
    string prefix = 2;
    string auth = 3;

    // BillingProject if non-empty, is the Google Cloud project to bill for all storage requests.
    // This is required to be set if using a "requestor pays" bucket.
    string billing_project = 4;

    string credentials = 5;
  }
  // Azure is the type of the AzureConfig field.
  message Azure {
    string container = 1;
    string prefix = 2;

    string account_name = 3;
    string account_key = 4;
  }
  // FileTable is the type of the FileTableConfig field.
  message FileTable {
    // User interacting with the external storage. This is used to check access
    // privileges of the requested user scoped tables.
    // This field is really of type security.SQLUsername. We can't use
    // the type directly however because it would create a circular dependency.
    string user = 1;

    // QualifiedTableName specifies the database.schema.tablename which the
    // FileTableSystem should interact with when servicing reads/writes.
    string qualified_table_name = 2;

    // Path is the filename being read/written to via the FileTableSystem.
    string path = 3;
  }
  // Per-provider configuration. LocalFile, HttpPath and FileTableConfig are
  // declared non-nullable; GoogleCloudConfig, S3Config and AzureConfig are
  // nullable and may therefore be unset.
  LocalFilePath LocalFile = 2 [(gogoproto.nullable) = false];
  Http HttpPath = 3 [(gogoproto.nullable) = false];
  GCS GoogleCloudConfig = 4;
  S3 S3Config = 5;
  Azure AzureConfig = 6;
  // Field number 7 is reserved and must not be reused.
  reserved 7;
  FileTable FileTableConfig = 8 [(gogoproto.nullable) = false];
}

// RetryTracingEvent is the trace recording used to track retries.
message RetryTracingEvent {
  // operation identifies what is being retried.
  // NOTE(review): presumably a human-readable label - confirm with the
  // code that emits this event.
  string operation = 1;
  // attempt_number is the number of the attempt this event describes.
  // NOTE(review): confirm whether numbering starts at 0 or 1.
  int32 attempt_number = 2;
  // retry_error is an error rendered as a string.
  // NOTE(review): presumably the error that caused the retry - confirm.
  string retry_error = 3;
}

// MVCCFilter selects which MVCC versions an ExportRequest exports (see
// ExportRequest.mvcc_filter).
enum MVCCFilter {
  // Latest exports the latest data as of the timestamp specified in the
  // request header. It is the zero value and therefore the default.
  Latest = 0;
  // All exports all data; ExportRequest.start_time is then used as the lower
  // bound and the request header's timestamp as the upper bound.
  All = 1;
}

// FileEncryptionOptions stores information needed by KV level requests (e.g.
// ExportRequest) to encrypt or decrypt data.
message FileEncryptionOptions {
  // Key specifies the key to use for encryption or decryption.
  bytes key = 1;
}

// ExportRequest is the argument to the Export() method, to dump a keyrange into
// files under a basepath.
//
// Note that the field numbers below do not follow declaration order; they
// must never be renumbered.
message ExportRequest {
  // The request header, embedded into the generated struct.
  RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // storage is the external storage configuration for the exported files.
  // See storage_by_locality_kv for per-locality overrides.
  ExternalStorage storage = 2 [(gogoproto.nullable) = false];

  // ResumeKeyTS allows export request to resume at arbitrary key timestamp. This
  // value doesn't affect export bounds, but ensures that no keys are skipped or
  // duplicated when previous request doesn't complete fully and returned EndKeyTS
  // together with resume span.
  util.hlc.Timestamp resume_key_ts = 12 [
      (gogoproto.nullable) = false,
      (gogoproto.customname) = "ResumeKeyTS"
  ];

  // MVCCFilter determines if request exports all data or latest data as of
  // Timestamp specified in request header. If all data is requested StartTime
  // is used as the lower bound and the header Timestamp as the upper bound of
  // exported entries.
  MVCCFilter mvcc_filter = 4 [(gogoproto.customname) = "MVCCFilter"];
  // StartTime is only used when MVCCFilter is set to All.
  util.hlc.Timestamp start_time = 3 [(gogoproto.nullable) = false];

  // Split large rows in the middle of key sequence. This option will allow
  // large history being broken up into target_file_size chunks and prevent
  // blowing up on memory usage. This option is only allowed together with
  // return_sst since caller should reconstruct full tables.
  bool split_mid_key = 13;

  // Return the exported SST data in the response.
  bool return_sst = 5 [(gogoproto.customname) = "ReturnSST"];

  // EnableTimeBoundIteratorOptimization, if true, enables a performance
  // optimization that allows us to entirely skip over sstables in RocksDB that
  // don't have data relevant to the time bounds in this request.
  //
  // This can have a dramatic impact on performance, but we've seen a number of
  // extremely subtle and hard to detect correctness issues with this (see
  // #28358 #34819). As a result, we've decided to skip the optimization
  // everywhere that it isn't absolutely necessary for the feature to work
  // (leaving one place: poller-based changefeeds, which are being phased out
  // anyway). This will both give increased confidence in correctness as well as
  // eliminate any need to investigate time-bound iterators when/if someone hits
  // a correctness bug.
  bool enable_time_bound_iterator_optimization = 7;
  // StorageByLocalityKV is a map of locality KVs to storage configurations. If
  // set, files will be written to the store that matches the most specific
  // locality KV in the map.
  map<string, ExternalStorage> storage_by_locality_kv = 8 [(gogoproto.customname) = "StorageByLocalityKV"];

  // encryption, if set, carries the key used to encrypt the exported data
  // (see FileEncryptionOptions). Per the note on return_sst_below_size,
  // SSTs returned in the response are never encrypted.
  FileEncryptionOptions encryption = 9;

  // TargetFileSize is the byte size target for individual files in the
  // response. If the MVCCFilter is Latest, the returned files will only be
  // larger than this value if an individual KV pair is larger than this value.
  // If the MVCCFilter is All then the file may exceed this value by at most the
  // size of all versions of a single key. If TargetFileSize is non-positive
  // then there is no limit.
  int64 target_file_size = 10;

  // ReturnSSTBelowSize is the threshold which (if non-zero) causes files which
  // are below that size threshold to be returned, as if ReturnSST were set,
  // instead of being written to the assigned storage location.
  // Note: returned SSTs are never encrypted.
  int64 return_sst_below_size = 11;
}

// BulkOpSummary summarizes the data processed by an operation, counting the
// total size as well as number of entries processed in each index (from which
// row counts can be derived).
message BulkOpSummary {
  // DataSize is the sum of key and value lengths.
  int64 data_size = 1;
  // DeprecatedRows contained the row count when "rows" were always defined as
  // entries in the index with ID 1, however since 20.1 and the introduction of
  // PK changes, the low-level counters that produce BulkOpSummaries are unable
  // to assume which index is primary and thus cannot distinguish "rows" vs
  // "index entries". Callers wishing to get a "row count" from a BulkOpSummary
  // should use EntryCounts instead, fetching the count for the table/index that
  // corresponds to the PK.
  int64 deprecated_rows = 2;
  // DeprecatedIndexEntries contained the index entry count prior to 20.1. See
  // the comment on DeprecatedRows for details.
  int64 deprecated_index_entries = 3;

  // Field number 4 is reserved and must not be reused.
  reserved 4;
  // EntryCounts contains the number of keys processed for each tableID/indexID
  // pair, stored under the key (tableID << 32) | indexID. This EntryCount key
  // generation logic is also available in the BulkOpSummaryID helper.
  map<uint64, int64> entry_counts = 5;
}

// ExportResponse is returned in response to an Export() operation.
message ExportResponse {
  // File describes one key range that was dumped to a file located at the
  // given path.
  message File {
    Span span = 1 [(gogoproto.nullable) = false];
    util.hlc.Timestamp end_key_ts = 9 [(gogoproto.nullable) = false, (gogoproto.customname) = "EndKeyTS"];
    string path = 2;
    // Field numbers 3 through 5 are reserved and must not be reused.
    reserved 3, 4, 5;

    BulkOpSummary exported = 6 [(gogoproto.nullable) = false];

    bytes sst = 7 [(gogoproto.customname) = "SST"];
    string locality_kv = 8 [(gogoproto.customname) = "LocalityKV"];
  }

  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  repeated File files = 2 [
    (gogoproto.nullable) = false
  ];
  util.hlc.Timestamp start_time = 3 [
    (gogoproto.nullable) = false
  ];
}

// AdminScatterRequest is the argument to the AdminScatter() method, which
// moves replicas and leaseholders for a selection of ranges.
//
// Scatter is best-effort: a range that cannot be moved does not fail the
// request; instead an error detail for it is included in the response.
message AdminScatterRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // NOTE(review): presumably asks for leaseholders to be randomized as part
  // of the scatter - confirm against the AdminScatter implementation.
  bool randomize_leases = 2;
}

// AdminScatterResponse is the response to an AdminScatter() operation.
message AdminScatterResponse {
  // The response header, embedded into the generated struct.
  ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];


  // Range is not referenced by any field of this message.
  // NOTE(review): presumably retained only for compatibility with the
  // reserved field 2 below - confirm before removing.
  message Range {
    Span span = 1 [(gogoproto.nullable) = false];
    // Field number 2 is reserved and must not be reused.
    reserved 2;
  }
  // Field number 2 is reserved and must not be reused.
  reserved 2;
  // range_infos holds one RangeInfo per entry.
  // NOTE(review): presumably one per range touched by the scatter - confirm.
  repeated RangeInfo range_infos = 3 [(gogoproto.nullable) = false];
}

// AdminVerifyProtectedTimestampRequest is the argument to the
// AdminVerifyProtectedTimestamp method which ensures that the specified record
// will be seen before data can be garbage collected at the timestamp.
//
// Note that the field numbers below do not follow declaration order
// (record_id is field 4); they must never be renumbered.
message AdminVerifyProtectedTimestampRequest {
  // The request header, embedded into the generated struct.
  RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];

  // RecordID is the ID of the protected timestamp Record being verified.
  bytes record_id = 4 [
      (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
      (gogoproto.nullable) = false,
      (gogoproto.customname) = "RecordID"
  ];

  // Protected is the timestamp at which the record with RecordID protects.
  util.hlc.Timestamp protected = 2 [(gogoproto.nullable) = false];

  // RecordAliveAt is an hlc timestamp at which the record being verified is
  // known to exist. A value for RecordAliveAt is generally determined by
  // reading a Record from the database and using the timestamp at which that
  // read occurred.
  util.hlc.Timestamp record_alive_at = 3 [(gogoproto.nullable) = false];
}


// AdminVerifyProtectedTimestampResponse is the response to the
// AdminVerifyProtectedTimestamp method, which ensures that the specified
// record will be seen before data can be garbage collected at the timestamp.
message AdminVerifyProtectedTimestampResponse {

  // The response header, embedded into the generated struct.
  ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // NOTE(review): presumably true iff verification succeeded - confirm
  // against the AdminVerifyProtectedTimestamp implementation.
  bool verified = 2;

  // TODO(adityamaru): Remove in 21.2.
  repeated RangeDescriptor deprecated_failed_ranges = 3 [(gogoproto.nullable) = false];
  // FailedRange identifies a range by its ID and key bounds and carries a
  // textual reason. It is the element type of verification_failed_ranges.
  message FailedRange {
    int64 range_id = 1 [(gogoproto.customname) = "RangeID"];
    bytes start_key = 2 [(gogoproto.casttype) = "RKey"];
    bytes end_key = 3 [(gogoproto.casttype) = "RKey"];
    string reason = 4;
  }
  // NOTE(review): presumably the ranges for which verification failed,
  // superseding deprecated_failed_ranges - confirm.
  repeated FailedRange verification_failed_ranges = 4 [(gogoproto.nullable) = false];
}

// AddSSTableRequest contains arguments to the AddSSTable method, which links an
// SST file into the Pebble log-structured merge-tree. The SST must only contain
// committed versioned values with non-zero MVCC timestamps (no intents or
// inline values) and no tombstones. It cannot be used in a transaction, cannot
// be split across ranges, and must be alone in a batch.
//
// By default, AddSSTable will blindly write the SST contents into Pebble, with
// fixed MVCC timestamps unaffected by pushes. This can violate many CRDB
// guarantees, including ACID, serializability and single-key linearizability:
// it mutates MVCC history (by replacing existing versions or writing below
// their timestamp) and does not respect the timestamp cache (by writing at
// timestamps that have already been read) nor the closed timestamp (by writing
// at immutable timestamps).
//
// The following parameters can be used to make AddSSTable enforce these
// guarantees, at a performance cost:
//
// * SSTTimestampToRequestTimestamp: ensures compliance with the timestamp cache
//   and closed timestamp, by rewriting SST timestamps to the request timestamp.
//   Also emits the SST via the range feed.
//
// * DisallowConflicts, DisallowShadowing, or DisallowShadowingBelow: ensures
//   compliance with MVCC, by checking for conflicting keys in existing data
//   instead of writing blindly.
//
// If the above parameters are not enabled, the caller must make sure these
// guarantees are upheld via other mechanisms. These options are orthogonal,
// providing different guarantees, and neither is sufficient by itself to
// enforce ACID guarantees -- they must both be enabled. See comments on these
// parameters for more details.
//
// AddSSTable always synchronizes with ongoing transactions, by taking out a
// lock span, scanning for separated intents, and resolving them. This is done
// even in the case of blind writes, since the caller is expected to make sure
// there are no ongoing writes to the ingested key span, so there should be few
// or no intents in the common case.
//
// If writing blindly (without e.g. DisallowConflicts), the range's MVCC stats
// may be incorrect as the SST stats are not adjusted for existing keys, so they
// will be marked with ContainsEstimates. The caller should recompute statistics
// after ingestion.
message AddSSTableRequest {
  // The request header, embedded into the generated struct.
  RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // data holds the SST to ingest.
  // NOTE(review): presumably the raw serialized SST file contents - confirm.
  bytes data = 2;

  // SSTTimestampToRequestTimestamp gives the timestamp used for all MVCC keys
  // in the provided SST. If this timestamp differs from the request timestamp
  // (e.g. if the request gets pushed) then all MVCC keys in the SST will be
  // rewritten to the request timestamp during request evaluation. This ensures
  // the writes comply with the timestamp cache and closed timestamp. It also
  // causes the AddSSTable to be emitted via the range feed, since it respects
  // the closed timestamp.
  //
  // Callers should always set this, except in very special circumstances when
  // the timestamp cache and closed timestamp can safely be ignored (e.g.
  // streaming replication into an offline tenant).
  //
  // Note that this alone is not sufficient to guarantee MVCC correctness, since
  // it can write below or replace existing committed versions (the tscache is
  // only bumped when the values are subsequently read). Use DisallowConflicts
  // or DisallowShadowing in addition to guarantee MVCC correctness and
  // serializability.
  //
  // Added in 22.1, so check the MVCCAddSSTable version gate before using.
  util.hlc.Timestamp sst_timestamp_to_request_timestamp = 6 [
    (gogoproto.customname) = "SSTTimestampToRequestTimestamp", (gogoproto.nullable) = false];

  // DisallowConflicts will check for MVCC conflicts with existing keys, i.e.
  // scan for existing keys with a timestamp at or above the SST key and
  // return WriteTooOldError (possibly retrying). It also ensures MVCC
  // statistics are accurately updated.
  //
  // Note that this alone is not sufficient to guarantee serializability or
  // single-key linearizability, since it can write to a timestamp that another
  // reader has already observed, changing the value at that timestamp and above
  // it. Use with SSTTimestampToRequestTimestamp to guarantee serializability.
  //
  // Added in 22.1, so check the MVCCAddSSTable version gate before using.
  //
  // TODO(erikgrinaker): It might be possible to avoid this parameter if we
  // could pick an MVCC timestamp that's guaranteed to not collide with
  // existing keys, see: https://github.com/cockroachdb/cockroach/issues/73047.
  // However, this would always lead to inaccurate MVCC stats.
  bool disallow_conflicts = 7; 

  // DisallowShadowing implies DisallowConflicts, and additionally rejects
  // writing above keys that have an existing/visible value (but will write
  // above tombstones).
  //
  // TODO(erikgrinaker): Consider removing this in 22.1 if all callers have
  // been migrated to DisallowShadowingBelow.
  bool disallow_shadowing = 3;

  // DisallowShadowingBelow implies DisallowConflicts, and additionally rejects
  // writing above keys that have an existing/visible value (but will write
  // above tombstones). Unlike DisallowShadowing, it allows shadowing keys
  // that have a timestamp at or above the given timestamp as long as the
  // value is identical to the existing value, and also allows idempotent writes
  // (same key/timestamp/value) at or above the given timestamp.
  //
  // This is a specialized method for the IMPORT INTO use-case, where we do not
  // want to shadow existing keys (which could cause them to be GCed before the
  // import finalizes, preventing a rollback), but we need to allow shadowing
  // keys that were previously written by the import itself in the case of a
  // resumption or retry. The equal value requirement is to avoid unique
  // constraint violations.
  //
  // If this parameter is used, the value of DisallowShadowing is ignored, so
  // callers may pass both for forward and backwards compatibility.
  //
  // Added in 22.1, so check the MVCCAddSSTable version gate before using.
  util.hlc.Timestamp disallow_shadowing_below = 8 [(gogoproto.nullable) = false];

  // MVCCStats, if set, is the MVCCStats for the contents of this SSTable and is
  // used as-is during evaluation of the AddSSTable command to update the range
  // MVCCStats, instead of computing the stats for the SSTable by iterating it.
  // Including these stats can make the evaluation of AddSSTable much cheaper.
  storage.enginepb.MVCCStats mvcc_stats = 4 [(gogoproto.customname) = "MVCCStats"];

  // IngestAsWrites causes the content of the provided SSTable to be ingested in
  // a regular WriteBatch, instead of directly adding the provided SST to the
  // storage engine. This is useful if the data size is so small that the fixed
  // costs of adding an extra file (file IO, triggering a flush, compactions)
  // would be higher than the marginal costs of the amount of data going through
  // the usual write pipeline (on-disk raft log, WAL, etc).
  // TODO(dt): https://github.com/cockroachdb/cockroach/issues/34579#issuecomment-544627193
  bool ingest_as_writes = 5;
}

// AddSSTableResponse is returned in response to an AddSSTable() operation. It
// carries nothing beyond the response header.
message AddSSTableResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// RefreshRequest is the argument to the Refresh() method, which verifies that
// the specified key has not been written to since the refresh_from timestamp.
// A transaction must be supplied with this request.
//
// On success the timestamp cache is updated to txn.read_timestamp, as it is
// for all requests. If the key has been written more recently than the
// provided txn timestamp, an error is returned and the timestamp cache is not
// updated.
message RefreshRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Field number 2 is reserved and must not be reused.
  reserved 2;

  // refresh_from is the exclusive lower bound of the verification: the request
  // checks that no writes (committed or provisional) exist in the range
  // (refresh_from, txn.read_timestamp].
  util.hlc.Timestamp refresh_from = 3 [
    (gogoproto.nullable) = false
  ];
}

// RefreshResponse is returned in response to a Refresh() operation. It carries
// nothing beyond the response header.
message RefreshResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// RefreshRangeRequest is the argument to the RefreshRange() method. It is
// similar to RefreshRequest (see the comments there), but operates on a key
// span instead of a single key.
message RefreshRangeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // Field number 2 is reserved and must not be reused.
  reserved 2;

  // refresh_from is the exclusive lower bound of the verification: the request
  // checks that no writes (committed or provisional) exist in the range
  // (refresh_from, txn.read_timestamp].
  util.hlc.Timestamp refresh_from = 3 [
    (gogoproto.nullable) = false
  ];
}

// RefreshRangeResponse is returned in response to a RefreshRange() operation.
// It carries nothing beyond the response header.
message RefreshRangeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// SubsumeRequest is the argument to the Subsume() method, which freezes a range
// so that it can be merged with its left-hand neighbor.
//
// When called correctly, Subsume provides important guarantees that ensure
// there is never a moment in time at which both ranges involved in the merge
// could process commands for the same keys. See the comment on Subsume for
// details.
//
// When used outside of a merge transaction, Subsume may return stale MVCC
// statistics. As a rule of thumb, it is incorrect to call Subsume from
// anywhere other than its carefully-chosen location within a merge
// transaction.
message SubsumeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // left_desc is the range descriptor for the left-hand side of the merge. The
  // right-hand side uses it to sanity-check the validity of the merge.
  RangeDescriptor left_desc = 2 [
    (gogoproto.nullable) = false
  ];
  // right_desc is the range descriptor for the right-hand side of the merge.
  // It should match the descriptor of the range evaluating this request.
  RangeDescriptor right_desc = 3 [
    (gogoproto.nullable) = false
  ];
}

// SubsumeResponse is the response to a SubsumeRequest.
message SubsumeResponse {
  // The response header, embedded into the generated struct.
  ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];

  // Field number 2 is reserved and must not be reused.
  reserved 2;

  // MVCCStats are the MVCC statistics for the range.
  storage.enginepb.MVCCStats mvcc_stats = 3 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "MVCCStats"
  ];

  // LeaseAppliedIndex is the lease applied index of the last applied command
  // at the time that the Subsume request executed. This is NOT intended to be
  // the lease index of the SubsumeRequest itself. Instead, it is intended to
  // provide the sender of the Subsume request with an upper bound on the lease
  // applied index of the CPut that left an intent on the local copy of the
  // right-hand range descriptor.
  uint64 lease_applied_index = 4;

  // FreezeStart is a timestamp that is guaranteed to be greater than the
  // timestamps at which any requests were serviced by the responding replica
  // before it stopped responding to requests altogether (in anticipation of
  // being subsumed). It is suitable for use as the timestamp cache's low water
  // mark for the keys previously owned by the subsumed range though this role
  // is largely being... subsumed by the RightReadSummary.
  util.hlc.Timestamp freeze_start = 5 [(gogoproto.nullable) = false,
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/util/hlc.ClockTimestamp"];

  // closed_timestamp is the range's closed timestamp at the moment of the
  // subsumption. Because the SubsumeRequest synchronizes with all other
  // requests, the range's closed timestamp does not advance past the snapshot
  // captured here.
  //
  // Like the freeze_start, this is used by the merged range to conditionally
  // bump the timestamp cache for the keys previously owned by the subsumed
  // range.
  //
  // Note that the closed timestamp is also reflected in the read_summary.
  // However, we carry it explicitly too because, in case the leaseholders of
  // the two sides are collocated at merge time, we don't need to use the
  // read_summary and simply use this field.
  util.hlc.Timestamp closed_timestamp = 6 [(gogoproto.nullable) = false];

  // ReadSummary is a summary of the reads that have been performed on the range
  // up to the point of the Subsume request, which serializes with past reads
  // and begins blocking future reads. It is suitable for use to update the
  // timestamp cache for the keys previously owned by the subsumed range.
  //
  // ReadSummary can be used in place of FreezeStart, when available. It has two
  // distinct advantages:
  // 1. it can transfer a higher-resolution snapshot of the reads on the range
  //    through a range merge, to make the merge less disruptive to writes on
  //    the post-merge range because the timestamp cache won't be bumped as
  //    high.
  // 2. it can transfer information about reads with synthetic timestamps, which
  //    are not otherwise captured by the FreezeStart clock timestamp.
  //
  // This field is nullable, so it may be absent from a response.
  kv.kvserver.readsummary.ReadSummary read_summary = 7;
}

// RangeStatsRequest is the argument to the RangeStats() method. It asks the
// receiving range for its MVCC statistics.
message RangeStatsRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// RangeStatsResponse is the response to a RangeStatsRequest.
//
// Note that the field numbers below do not follow declaration order
// (range_info is field 4 but is declared last); they must never be
// renumbered.
message RangeStatsResponse {
  // The response header, embedded into the generated struct.
  ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];

  // MVCCStats are the MVCC statistics for the range that processed the
  // request.
  storage.enginepb.MVCCStats mvcc_stats = 2 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "MVCCStats"
  ];

  // DeprecatedLastQueriesPerSecond is the most recent rate of request/s or QPS
  // for the range. The field is deprecated in favor of MaxQueriesPerSecond.
  //
  // TODO(nvanbenschoten): remove this field in v22.1 when all nodes in the
  // cluster are guaranteed to return MaxQueriesPerSecond.
  double deprecated_last_queries_per_second = 3;

  // MaxQueriesPerSecond is the maximum rate of request/s or QPS that the range
  // has served over a configured measurement period. Set to -1 if the replica
  // serving the RangeStats request has not been the leaseholder long enough to
  // have recorded request rates for at least a full measurement period. In such
  // cases, the recipient should not consider the QPS value reliable enough to
  // base important decisions off of.
  double max_queries_per_second = 5;

  // MaxQueriesPerSecondSet indicates that the MaxQueriesPerSecond field is set
  // by the server. Used to distinguish 0 qps set by a new server from the field
  // not being set at all by an old server.
  //
  // TODO(nvanbenschoten): stop consulting this field on the receiver in v22.1
  // when all nodes in the cluster are guaranteed to return MaxQueriesPerSecond.
  //
  // TODO(nvanbenschoten): stop setting this field and remove it in v22.2 when
  // no nodes in the cluster consult this field.
  bool max_queries_per_second_set = 6;

  // range_info contains descriptor and lease information.
  RangeInfo range_info = 4 [(gogoproto.nullable) = false];
}

// MigrateRequest is used to instruct all ranges overlapping with it to
// exercise any relevant (below-raft) migrations, so that each range's state
// conforms to what's needed by the specified version. It is a core primitive
// of our migrations infrastructure, used to phase out legacy code below raft.
//
// KV waits for this command to durably apply on all replicas before
// returning, which guarantees to the caller that all pre-migration state has
// been completely purged from the system.
message MigrateRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // version selects the specific migration to exercise.
  Version version = 2 [
    (gogoproto.nullable) = false
  ];
}

// MigrateResponse is returned in response to a Migrate operation. It carries
// nothing beyond the response header.
message MigrateResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// QueryResolvedTimestampRequest is the argument to the QueryResolvedTimestamp()
// method. It requests a resolved timestamp for the key span it is issued over.
//
// A resolved timestamp for a key span is a timestamp at or below which all
// future reads within the span are guaranteed to produce the same results, i.e.
// at which MVCC history has become immutable. The most up-to-date such bound
// can be computed for a key span contained in a single range by taking the
// minimum of the leaseholder's closed timestamp and the timestamp preceding the
// earliest intent present on the range that overlaps with the key span of
// interest. This optimum timestamp is nondecreasing over time, since the closed
// timestamp will not regress and since it also prevents intents at lower
// timestamps from being created. Follower replicas can also provide a resolved
// timestamp, though it may not be the most recent one due to replication delay.
// However, a given follower replica will similarly produce a nondecreasing
// sequence of resolved timestamps.
//
// QueryResolvedTimestampRequest returns a resolved timestamp for the input key
// span by returning the minimum of all replicas contacted in order to cover the
// key span. This means that repeated invocations of this operation will be
// guaranteed nondecreasing only if routed to the same replicas.
//
// A CONSISTENT read at or below a key span's resolved timestamp will never
// block on replication or on conflicting transactions. However, as can be
// inferred from the previous paragraph, for this to be guaranteed, the read
// must be issued to the same replica or set of replicas (for multi-range reads)
// that were consulted when computing the key span's resolved timestamp.
//
// A resolved timestamp for a key span is a sibling concept to a resolved
// timestamp for a rangefeed, which is defined in:
//   pkg/kv/kvserver/rangefeed/resolved_timestamp.go
// Whereas a resolved timestamp for a rangefeed refers to a timestamp below
// which no future updates will be published on the rangefeed, a resolved
// timestamp for a key span refers to a timestamp below which no future state
// modifications that could change the result of read requests will be made.
// Both concepts rely on some notion of immutability, but the former imparts
// this property on a stream of events while the latter imparts this property
// on materialized state.
message QueryResolvedTimestampRequest {
  // The request header, embedded into the generated struct.
  RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}

// QueryResolvedTimestampResponse is the response to a QueryResolvedTimestampRequest.
message QueryResolvedTimestampResponse {
  // header is the response header. It is embedded, so its fields are promoted
  // onto QueryResolvedTimestampResponse in the generated Go code.
  ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];

  // ResolvedTS is the resolved timestamp of the key span. The Go field is
  // named ResolvedTS (rather than the default ResolvedTs) via customname.
  util.hlc.Timestamp resolved_ts = 2 [
    (gogoproto.nullable) = false, (gogoproto.customname) = "ResolvedTS"];
}

// ScanInterleavedIntentsRequest is the request for a ScanInterleavedIntents operation.
// This is a read-only operation that returns all interleaved (non-separated)
// intents found over the request range.
message ScanInterleavedIntentsRequest {
  // header is the request header. It is embedded, so its fields are promoted
  // onto ScanInterleavedIntentsRequest in the generated Go code.
  RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}

// ScanInterleavedIntentsResponse is the response to a ScanInterleavedIntents operation.
message ScanInterleavedIntentsResponse {
  // header is the response header. It is embedded, so its fields are promoted
  // onto ScanInterleavedIntentsResponse in the generated Go code.
  ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];

  // The intents encountered in the part of the request span that was
  // evaluated. A resume span is set in the response header if the entirety of
  // the request span was not evaluated.
  //
  // NOTE(review): field number 2 is skipped in this message and is not listed
  // as reserved. If it was ever assigned to a since-removed field, it should be
  // reserved to prevent reuse -- confirm against history.
  repeated Intent intents = 3 [(gogoproto.nullable) = false];
}

// BarrierRequest is the request for a Barrier operation. This goes through Raft
// and has the purpose of waiting until all conflicting in-flight operations on
// this range have completed, without blocking any new operations.
message BarrierRequest {
  // header is the request header. It is embedded, so its fields are promoted
  // onto BarrierRequest in the generated Go code.
  RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}

// BarrierResponse is the response for a Barrier operation.
message BarrierResponse {
  // header is the response header. It is embedded, so its fields are promoted
  // onto BarrierResponse in the generated Go code.
  ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];

  // Timestamp at which this Barrier was evaluated. Can be used to guarantee
  // future operations happen on the same or newer leaseholders.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}

// A RequestUnion contains exactly one of the requests.
// The values added here must match those in ResponseUnion.
//
// Be cautious about deprecating fields as doing so can lead to inconsistencies
// between replicas.
//
// NB: members are not declared in field-number order; the field number, not
// the position in this list, is what identifies a member on the wire.
message RequestUnion {
  oneof value {
    GetRequest get = 1;
    PutRequest put = 2;
    ConditionalPutRequest conditional_put = 3;
    IncrementRequest increment = 4;
    DeleteRequest delete = 5;
    DeleteRangeRequest delete_range = 6;
    ClearRangeRequest clear_range = 38;
    RevertRangeRequest revert_range = 48;
    ScanRequest scan = 7;
    EndTxnRequest end_txn = 9;
    AdminSplitRequest admin_split = 10;
    AdminUnsplitRequest admin_unsplit = 47;
    AdminMergeRequest admin_merge = 11;
    AdminTransferLeaseRequest admin_transfer_lease = 29;
    AdminChangeReplicasRequest admin_change_replicas = 35;
    AdminRelocateRangeRequest admin_relocate_range = 45;
    HeartbeatTxnRequest heartbeat_txn = 12;
    GCRequest gc = 13;
    PushTxnRequest push_txn = 14;
    RecoverTxnRequest recover_txn = 46;
    ResolveIntentRequest resolve_intent = 16;
    ResolveIntentRangeRequest resolve_intent_range = 17;
    MergeRequest merge = 18;
    TruncateLogRequest truncate_log = 19;
    RequestLeaseRequest request_lease = 20;
    ReverseScanRequest reverse_scan = 21;
    ComputeChecksumRequest compute_checksum = 22;
    CheckConsistencyRequest check_consistency = 24;
    InitPutRequest init_put = 26;
    // NB: transfer_lease is the one member without a same-numbered counterpart
    // in ResponseUnion, which lists 28 as reserved instead.
    TransferLeaseRequest transfer_lease = 28;
    LeaseInfoRequest lease_info = 30;
    ExportRequest export = 32;
    QueryTxnRequest query_txn = 33;
    QueryIntentRequest query_intent = 42;
    AdminScatterRequest admin_scatter = 36;
    AddSSTableRequest add_sstable = 37;
    RecomputeStatsRequest recompute_stats = 39;
    RefreshRequest refresh = 40;
    RefreshRangeRequest refresh_range = 41;
    SubsumeRequest subsume = 43;
    RangeStatsRequest range_stats = 44;
    AdminVerifyProtectedTimestampRequest admin_verify_protected_timestamp = 49;
    MigrateRequest migrate = 50;
    QueryResolvedTimestampRequest query_resolved_timestamp = 51;
    ScanInterleavedIntentsRequest scan_interleaved_intents = 52;
    BarrierRequest barrier = 53;
    ProbeRequest probe = 54;
  }
  // Field numbers that must not be reused by new members.
  reserved 8, 15, 23, 25, 27, 31, 34;
}

// A ResponseUnion contains exactly one of the responses.
// The values added here must match those in RequestUnion.
//
// NB: members are not declared in field-number order; the field number, not
// the position in this list, is what identifies a member on the wire.
message ResponseUnion {
  oneof value {
    GetResponse get = 1;
    PutResponse put = 2;
    ConditionalPutResponse conditional_put = 3;
    IncrementResponse increment = 4;
    DeleteResponse delete = 5;
    DeleteRangeResponse delete_range = 6;
    ClearRangeResponse clear_range = 38;
    RevertRangeResponse revert_range = 48;
    ScanResponse scan = 7;
    EndTxnResponse end_txn = 9;
    AdminSplitResponse admin_split = 10;
    AdminUnsplitResponse admin_unsplit = 47;
    AdminMergeResponse admin_merge = 11;
    AdminTransferLeaseResponse admin_transfer_lease = 29;
    AdminChangeReplicasResponse admin_change_replicas = 35;
    AdminRelocateRangeResponse admin_relocate_range = 45;
    HeartbeatTxnResponse heartbeat_txn = 12;
    GCResponse gc = 13;
    PushTxnResponse push_txn = 14;
    RecoverTxnResponse recover_txn = 46;
    ResolveIntentResponse resolve_intent = 16;
    ResolveIntentRangeResponse resolve_intent_range = 17;
    MergeResponse merge = 18;
    TruncateLogResponse truncate_log = 19;
    RequestLeaseResponse request_lease = 20;
    ReverseScanResponse reverse_scan = 21;
    ComputeChecksumResponse compute_checksum = 22;
    CheckConsistencyResponse check_consistency = 24;
    InitPutResponse init_put = 26;
    LeaseInfoResponse lease_info = 30;
    ExportResponse export = 32;
    QueryTxnResponse query_txn = 33;
    QueryIntentResponse query_intent = 42;
    AdminScatterResponse admin_scatter = 36;
    AddSSTableResponse add_sstable = 37;
    RecomputeStatsResponse recompute_stats = 39;
    RefreshResponse refresh = 40;
    RefreshRangeResponse refresh_range = 41;
    SubsumeResponse subsume = 43;
    RangeStatsResponse range_stats = 44;
    AdminVerifyProtectedTimestampResponse admin_verify_protected_timestamp = 49;
    MigrateResponse migrate = 50;
    QueryResolvedTimestampResponse query_resolved_timestamp = 51;
    ScanInterleavedIntentsResponse scan_interleaved_intents = 52;
    BarrierResponse barrier = 53;
    ProbeResponse probe = 54;
  }
  // Field numbers that must not be reused by new members. In addition to the
  // numbers reserved in RequestUnion, 28 is reserved here: it is used by
  // RequestUnion.transfer_lease, which has no dedicated member in this union
  // (presumably that request is answered with another response type --
  // confirm).
  reserved 8, 15, 23, 25, 27, 28, 31, 34;
}

// A Header is attached to a BatchRequest, encapsulating routing and auxiliary
// information required for executing it.
message Header {
  // timestamp specifies time at which reads or writes should be performed. If
  // the timestamp is set to zero value, its value is initialized to the wall
  // time of the server node.
  //
  // Transactional requests are not allowed to set this field; they must rely on
  // the server to set it from txn.ReadTimestamp. Also, for transactional
  // requests, writes are performed at the provisional commit timestamp
  // (txn.WriteTimestamp).
  util.hlc.Timestamp timestamp = 1 [(gogoproto.nullable) = false];
  // timestamp_from_server_clock is set when a non-transactional BatchRequest
  // has its timestamp initialized to the wall time of the server node. When
  // non-nil, this denotes the clock timestamp that the timestamp field was
  // initially assigned upon arriving at the server node.
  //
  // It is needed to ensure that such requests are never served as follower
  // reads. Initializing the timestamp of a request on a node that holds a
  // follower instead of the leaseholder for a range and then using this
  // timestamp to deem a follower read safe could allow for consistency
  // violations where a non-transactional read following a write could fail to
  // observe the write.
  //
  // It is also needed to record the time at which the request was received by
  // the server node. The operation timestamp cannot serve this role because it
  // can change due to server-side uncertainty retries. By remembering a stable
  // reference to the initial timestamp, we ensure that a non-transactional
  // request's uncertainty interval remains fixed across retries.
  util.hlc.Timestamp timestamp_from_server_clock = 27 [
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/util/hlc.ClockTimestamp"];
  // replica specifies the destination of the request.
  ReplicaDescriptor replica = 2 [(gogoproto.nullable) = false];
  // range_id specifies the ID of the Raft consensus group which the key
  // range belongs to. This is used by the receiving node to route the
  // request to the correct range.
  int64 range_id = 3 [(gogoproto.customname) = "RangeID", (gogoproto.casttype) = "RangeID"];
  // user_priority allows any command's priority to be biased from the
  // default random priority. It specifies a multiple. If set to 0.5,
  // the chosen priority will be 1/2x as likely to beat any default
  // random priority. If set to 1, a default random priority is
  // chosen. If set to 2, the chosen priority will be 2x as likely to
  // beat any default random priority, and so on. As a special case, 0
  // priority is treated the same as 1. This value is ignored if txn
  // is specified. The min and max user priorities are set via
  // MinUserPriority and MaxUserPriority in data.go.
  double user_priority = 4 [(gogoproto.casttype) = "UserPriority"];
  // txn is set non-nil if a transaction is underway. To start a txn,
  // the first request should set this field to non-nil with name and
  // isolation level set as desired. The response will contain the
  // fully-initialized transaction with txn ID, priority, initial
  // timestamp, and maximum timestamp.
  Transaction txn = 5;
  // read_consistency specifies the consistency for read
  // operations. The default is CONSISTENT. This value is ignored for
  // write operations.
  ReadConsistencyType read_consistency = 6;
  // routing_policy specifies how a request should be routed to the
  // replicas of its target range(s) by the DistSender. The default is
  // LEASEHOLDER, which means that the request should be routed to the
  // leaseholder(s) of its target range(s).
  //
  // The policy does not dictate which replicas in a range are eligible to
  // serve the request, only which replicas are considered as targets by
  // the DistSender, and in which order. A request that is routed to an
  // ineligible replica (a function of request type, timestamp, and read
  // consistency) will be rejected by that replica and the DistSender will
  // target another replica in the range.
  RoutingPolicy routing_policy = 19;
  // wait_policy specifies the desired behavior for the batch request if it
  // encounters a conflicting lock held by another active transaction.
  //
  // If an Error wait policy is set and a conflicting lock held by an active
  // transaction is encountered, a WriteIntentError will be returned.
  //
  // If the desired behavior is to block on the conflicting lock up to some
  // maximum duration, use the Block wait policy and set a context timeout.
  kv.kvserver.concurrency.lock.WaitPolicy wait_policy = 18;
  // lock_timeout specifies the maximum amount of time that the batch request
  // will wait while attempting to acquire a lock on a key or while blocking on
  // an existing lock in order to perform a non-locking read on a key. The time
  // limit applies separately to each lock acquisition attempt. If the timeout
  // elapses when waiting for a lock, a WriteIntentError will be returned.
  //
  // Unlike in some other systems like PostgreSQL, where non-locking reads do
  // not wait on conflicting locks, in CockroachDB, non-locking reads do wait
  // for conflicting locks to be released. Because of this, the lock_timeout
  // configuration applies to non-locking reads in read-write and read-only
  // transactions as well.
  //
  // Only the (default) Block wait policy will allow a request to wait on
  // conflicting locks, so the timeout only makes sense in conjunction with the
  // Block wait policy. The Error wait policy will throw an error immediately if
  // a conflicting lock held by an active transaction is encountered, so this
  // timeout can never be hit with an Error wait policy.
  //
  // A value of zero disables the timeout.
  google.protobuf.Duration lock_timeout = 21 [(gogoproto.nullable) = false,
                                              (gogoproto.stdduration) = true];
  // If set to a non-zero value, the total number of keys touched by requests in
  // the batch is limited. A resume span will be provided on the response of the
  // requests that were not able to run to completion before the limit was
  // reached.
  //
  // Overlapping requests
  //
  // The spans accessed by the requests are allowed to overlap. However, if any
  // requests overlap, the caller must be prepared to handle *multiple* partial
  // responses in the corresponding BatchResponse. If no requests overlap, then
  // only up to one request will return a partial result. Additionally, if two
  // requests touch the same key, it is double counted towards the key limit.
  //
  // Unordered requests
  //
  // The spans accessed by requests do not need to be in sorted order. However,
  // if the requests are not in sorted order (e.g. increasing key order for
  // Scans and other forward requests, decreasing key order for ReverseScans),
  // the caller must be prepared to handle empty responses interleaved with full
  // responses and one (or more, see "Overlapping requests") partial response
  // in the corresponding BatchResponse. If the requests are in sorted order,
  // the caller can expect to receive a group of full responses, one (or more)
  // partial responses, and a group of empty responses.
  //
  // Pagination of requests
  //
  // As discussed above, overlapping requests or unordered requests in batches
  // with a limit can lead to response batches with multiple partial responses.
  // In practice, this is because DistSender paginates request evaluation over
  // ranges in increasing key order (decreasing for reverse batches). As ranges
  // are iterated over in order, all requests that target a given range are sent
  // to it, regardless of their position in the batch. Once split and delivered
  // to a range, the applicable requests are executed in-full according to their
  // order in the batch.
  //
  // This behavior makes it difficult to make assumptions about the resume spans
  // of individual responses in batches that contain either overlapping requests
  // or unordered requests. As such, clients should not make assumptions about
  // resume spans and should instead inspect the result for every request in the
  // batch if it cannot guarantee that the batch is ordered with no overlapping
  // requests.
  //
  // Supported requests
  //
  // If a limit is provided, the batch can contain only the following range
  // request types:
  // - ScanRequest
  // - ReverseScanRequest
  // - DeleteRangeRequest(*)
  // - GetRequest
  // - RevertRangeRequest
  // - ResolveIntentRangeRequest
  //
  // The following two requests types are also allowed in the batch, although
  // the limit has no effect on them:
  // - QueryIntentRequest
  // - EndTxnRequest
  //
  // [*] DeleteRangeRequests are generally not allowed to be batched together
  // with a commit (i.e. 1PC), except if Require1PC is also set. See #37457.
  //
  // Forward requests and reverse requests cannot be mixed in the same batch if
  // a limit is set. There doesn't seem to be a fundamental reason for this
  // restriction, but a batch that mixed forward and reverse requests would be
  // impossible to order, so it would unavoidably have to deal with the added
  // complications discussed in "Unordered requests". For now, that's a good
  // enough reason to disallow such batches.
  int64 max_span_request_keys = 8;
  // If set to a non-zero value, sets a target (in bytes) for how large the
  // response may grow. This is only supported for (forward and reverse) scans
  // and limits the number of rows scanned (and returned). For cluster versions
  // 21.2 and below, the target will be overshot. For 22.1 and above, the
  // target will only be overshot when the first result is larger than the
  // target (see TargetBytesAvoidExcess cluster version), unless
  // allow_empty is set. A suitable resume span will be returned.
  //
  // The semantics around overlapping requests, unordered requests, and
  // supported requests from max_span_request_keys apply to the target_bytes
  // option as well.
  int64 target_bytes = 15;
  // If true, allow returning an empty result when the first result exceeds a
  // limit (e.g. TargetBytes). Only effective on 22.1 clusters with
  // TargetBytesAvoidExcess version gate enabled. Only supported by Get, Scan,
  // and ReverseScan.
  bool allow_empty = 23;
  // If positive, Scan and ReverseScan requests with limits (MaxSpanRequestKeys
  // or TargetBytes) will not return results with partial SQL rows at the end
  // (recall that SQL rows can span multiple keys). Such partial rows will be
  // removed from the result, unless AllowEmpty is false and the partial row is
  // the first result row, in which case additional keys will be fetched to
  // complete the row.
  //
  // The given value specifies the maximum number of keys in a row (i.e. the
  // number of column families). If any larger rows are found at the end of the
  // result, an error is returned.
  //
  // Added in 22.1, callers must check the ScanWholeRows version gate first.
  int32 whole_rows_of_size = 26;
  // If true, DistSender returns partial non-empty results when encountering a
  // range boundary, with an appropriate resume span and reason
  // RESUME_RANGE_BOUNDARY. This will disable parallelism of DistSender
  // requests, similarly to the other limits above.
  //
  // This is useful in particular to prevent the DistSender from sending
  // requests with very small remaining limits (e.g. TargetBytes) to subsequent
  // ranges. Normally, as the DistSender iterates across ranges, each request's
  // limits will be reduced by the results returned from the previous range,
  // which can end up sending e.g. a request with TargetBytes=1 to the next
  // range in the worst case. Enabling this option would instead return the
  // partial result to the client, who then sends a new request with a new, full
  // limit. In aggregate, as the client keeps paginating, this can significantly
  // reduce the overall number of RPC requests sent to ranges, and thus the
  // overall latency of the operation.
  //
  // However, the benefit depends greatly on the limits, key spans, overlap with
  // range boundaries, as well as MVCC garbage. In some cases, it can be
  // detrimental due to the cost of additional client sender stack traversals,
  // so it is more appropriate for expensive requests (e.g. Scan) than cheap
  // requests (e.g. Get). Benchmarks have shown that this relative overhead
  // approaches zero as range RPC request latency increases past 1 ms.
  bool return_on_range_boundary = 24;
  // If set, all of the spans in the batch are distinct. Note that the
  // calculation of distinct spans does not include intents in an
  // EndTxnRequest. Currently set conservatively: a request might be
  // composed of distinct spans yet have this field set to false.
  bool distinct_spans = 9;
  // client_range_info represents the kvclient's knowledge about the state of
  // the range (i.e. of the range descriptor and lease). The kvserver checks
  // whether the client's info is up to date and, if it isn't, it will return a
  // RangeInfo with up-to-date information.
  ClientRangeInfo client_range_info = 17 [(gogoproto.nullable) = false];
  // gateway_node_id is the ID of the gateway node where the request originated.
  // For requests from tenants, this is set to the NodeID of the KV node handling
  // the BatchRequest.
  int32 gateway_node_id = 11 [(gogoproto.customname) = "GatewayNodeID", (gogoproto.casttype) = "NodeID"];
  // If set, the request will return to the client before proposing the
  // request into Raft. All consensus processing will be performed
  // asynchronously. Because consensus may fail, this means that the
  // request cannot be expected to succeed. Instead, its success must
  // be verified.
  // TODO(nvanbenschoten): Handling cases where consensus fails would
  // be much more straightforward if all transactional requests were
  // idempotent. We could just re-issue requests. See #26915.
  bool async_consensus = 13;
  // can_forward_read_timestamp indicates that the batch can be evaluated at a
  // higher timestamp than the transaction's read timestamp. The flag is only
  // applicable to transactional batches and is assumed to be true for all
  // non-transactional batches. It is set by the client if the transaction
  // has not performed any reads that must be refreshed prior to sending this
  // current batch. When set, it allows the server to handle pushes and write
  // too old conditions locally.
  bool can_forward_read_timestamp = 16;
  // bounded_staleness is set when a read-only batch is performing a bounded
  // staleness read and wants its timestamp to be chosen dynamically, based
  // on a resolved timestamp from its target replica(s).
  //
  // Transactional requests are not allowed to set this field. As a result,
  // the request can not span ranges and will be rejected by the DistSender
  // with an OpRequiresTxnError if it attempts to.
  //
  // Requests with a non-zero timestamp are not allowed to set this field.
  BoundedStalenessHeader bounded_staleness = 22;

  // trace_info is the tracing metadata attached to the batch.
  // NOTE(review): presumably used to propagate the client's trace context to
  // the server -- confirm against util/tracing/tracingpb/tracing.proto.
  util.tracing.tracingpb.TraceInfo trace_info = 25 [(gogoproto.nullable) = false];

  // Field numbers that must not be reused by new fields.
  reserved 7, 10, 12, 14, 20;
}

// BoundedStalenessHeader contains configuration values pertaining to bounded
// staleness read requests.
message BoundedStalenessHeader {
  // min_timestamp_bound places an (inclusive) lower bound on the dynamically
  // chosen timestamp during a bounded staleness read. During such reads, a
  // resolved timestamp over the request's key span will initially be computed
  // on the target replica. If the resolved timestamp is equal to or greater
  // than the minimum timestamp bound, then the batch timestamp will be set to
  // the resolved timestamp, the batch will be evaluated, and the negotiated
  // timestamp will be recorded in the BatchResponse Header. If the resolved
  // timestamp is less than the minimum timestamp bound, then the request will
  // either be rejected with a MinTimestampBoundUnsatisfiableError or will be
  // redirected to the leaseholder and permitted to block on conflicting
  // transactions, depending on the value of min_timestamp_bound_strict.
  //
  // This field is required when a BoundedStalenessHeader is set in a Header.
  util.hlc.Timestamp min_timestamp_bound = 1 [(gogoproto.nullable) = false];
  // min_timestamp_bound_strict specifies whether a bounded staleness read
  // whose min_timestamp_bound cannot be satisfied by the first replica it
  // visits (subject to routing_policy) without blocking should be rejected
  // with a MinTimestampBoundUnsatisfiableError or will be redirected to the
  // leaseholder and permitted to block on conflicting transactions. If the
  // flag is true, blocking is never permitted and users should be prepared
  // to handle MinTimestampBoundUnsatisfiableErrors. If the flag is false,
  // blocking is permitted and MinTimestampBoundUnsatisfiableErrors will never
  // be returned.
  bool min_timestamp_bound_strict = 2;
  // max_timestamp_bound places an (exclusive) upper bound on the dynamically
  // chosen timestamp during a bounded staleness read. If the field is set and a
  // resolved timestamp over the request's key span is computed to be greater
  // than or equal to the maximum timestamp bound, the batch timestamp will be
  // set to the timestamp preceding the maximum timestamp bound instead, then
  // the batch will be evaluated at this timestamp, and it will be recorded in
  // the BatchResponse Header.
  //
  // This field is optional when a BoundedStalenessHeader is set in a Header.
  // However, if the field is set, max_timestamp_bound must be greater than
  // min_timestamp_bound.
  //
  // NOTE(review): the field is non-nullable, so "not set" is presumably
  // represented by the zero Timestamp rather than by a nil pointer -- confirm.
  util.hlc.Timestamp max_timestamp_bound = 3 [(gogoproto.nullable) = false];
}

// AdmissionHeader contains information utilized by admission control for the
// request.
message AdmissionHeader {
  // Priority is utilized within a tenant. See admission.WorkPriority.
  int32 priority = 1;
  // CreateTime is equivalent to Time.UnixNano() at the creation time of this
  // request or a parent request. See admission.WorkInfo.Priority for details.
  // It is used to give preference to older requests.
  int64 create_time = 2;

  // Source represents the immediate source of a request. FROM_SQL represents
  // a KV request originating in SQL, and ROOT_KV represents a request
  // originating within KV, but at the root of the tree of requests.
  // BatchRequests can cause other BatchRequests (e.g. to PushTransaction),
  // which may result in yet more BatchRequests (e.g. to
  // ResolveIndeterminateCommit). These can cause a deadlock when using slot
  // based admission control. Only FROM_SQL and ROOT_KV requests are subject
  // to admission control. This behavior errs on the side of no deadlock, if
  // we overlooked an instrumentation point, since the default value of OTHER
  // bypasses admission control.
  enum Source {
    // OTHER is the default (zero) value; such requests bypass admission
    // control.
    OTHER = 0;
    // FROM_SQL marks a KV request originating in SQL; subject to admission
    // control.
    FROM_SQL = 1;
    // ROOT_KV marks a request originating within KV, at the root of the tree
    // of requests; subject to admission control.
    ROOT_KV = 2;
  }
  // source is the immediate source of this request; see Source.
  Source source = 3;

  // SourceLocation specifies physically where the call originated. LOCAL
  // means the client is collocated on the same node as the server. It is set
  // on codepaths that use internalClientAdapter which avoids using gRPC for
  // local calls to the KV API.
  enum SourceLocation {
    // REMOTE is the default (zero) value.
    REMOTE = 0;
    // LOCAL means the client is collocated on the same node as the server.
    LOCAL = 1;
  }
  // source_location is where the call physically originated; see
  // SourceLocation.
  SourceLocation source_location = 4;

  // NoMemoryReservedAtSource is set by the source/client when it has
  // effectively reserved close to 0 bytes. It is read by the server only when
  // SourceLocation=LOCAL, to differentiate this 0 reservation case from the
  // case where the client has already reserved enough memory based on
  // previous responses. In the latter case the server avoids reserving more
  // since it would result in double counting. Not setting this defaults to
  // turning off server reserving more memory -- this optimistic choice was
  // made to err on the side of avoiding double counting in case we forget to
  // instrument some calling path.
  //
  // NOTE: This field is a temporary field until we move to comprehensive
  // accounting at the client, by reserving all the bytes for responses, and
  // explicitly propagating these through DistSender to the servers, so the
  // servers can (best-effort) respect these values when producing responses.
  // In that future world, the local server will explicitly know what has
  // already been accounted for, and can start reserving more only when it
  // exceeds.
  bool no_memory_reserved_at_source = 5;
}

// A BatchRequest contains one or more requests to be executed in
// parallel, or if applicable (based on write-only commands and
// range-locality), as a single update.
message BatchRequest {
  // The default generated String() method is suppressed for this message.
  option (gogoproto.goproto_stringer) = false;

  // header is the batch-level Header. It is embedded, so its fields are
  // promoted onto BatchRequest in the generated Go code.
  Header header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // requests holds the individual requests of the batch, each wrapped in a
  // RequestUnion.
  repeated RequestUnion requests = 2 [(gogoproto.nullable) = false];
  // admission_header carries the information used by admission control for
  // this batch.
  AdmissionHeader admission_header = 3 [(gogoproto.nullable) = false];
}

// A BatchResponse contains one or more responses, one per request
// corresponding to the requests in the matching BatchRequest. The
// error in the response header is set to the first error from the
// slice of responses, if applicable.
message BatchResponse {
  // The default generated String() method is suppressed for this message.
  option (gogoproto.goproto_stringer) = false;

  // Header is the batch-level response header. It is nested here and is
  // distinct from the top-level Header message attached to a BatchRequest.
  message Header {
    reserved 4;
    // error communicates a structured error (i.e. one originating from a Node)
    // while the BatchResponse is sent over the network. If the code were
    // written today, the RPC endpoint would return a message containing both a
    // BatchResponse and an Error, and this embedding would go away. However, it
    // returns only a BatchResponse, and so the Error needs to be tucked away
    // somewhere (the structured error cannot be communicated via an RPC-level
    // error).
    //
    // Outside of the RPC boundaries, this field is nil and must neither be
    // checked nor populated (it is reset by the DistSender, which extracts this
    // error and returns it separately). In effect, nearly no usage of
    // BatchResponse needs to care about this field.
    kv.kvpb.Error error = 1;
    // timestamp denotes the timestamp at which the batch's reads executed. The
    // timestamp cache is updated at this timestamp.
    //
    // TODO(tbg): for transactional requests, this duplicates `Txn.ReadTimestamp`,
    // which is awkward. We could consider making this field optional and only
    // populate it for non-transactional requests. The timestamp cache would then
    // use an accessor on BatchResponse to pick the one that matters.
    util.hlc.Timestamp Timestamp = 2 [(gogoproto.nullable) = false];
    // txn is non-nil if the request specified a non-nil
    // transaction. The transaction timestamp and/or priority may have
    // been updated, depending on the outcome of the request.
    Transaction txn = 3;
    // now is the highest current time from any node contacted during the request.
    // It can be used by the receiver to update its local HLC.
    util.hlc.Timestamp now = 5 [(gogoproto.nullable) = false,
      (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/util/hlc.ClockTimestamp"];
    // collected_spans stores trace spans recorded during the execution of this
    // request.
    repeated util.tracing.tracingpb.RecordedSpan collected_spans = 6 [(gogoproto.nullable) = false];
    // Range used to execute the request. The server only populates this if the
    // server detects the client's client_range_info to be stale. Otherwise, it
    // is left empty. Not set when Error is set.
    //
    // The server may also include additional RangeInfo objects if it suspects
    // that the client may be interested in them. This is currently the case
    // immediately after a Range split, where a stale client_range_info from
    // before the split is taken to be an indication that the client may be
    // interested in information about both the LHS and RHS post-split ranges.
    //
    // The field is cleared by the DistSender because it refers to routing
    // information not exposed by the KV API.
    repeated RangeInfo range_infos = 7 [(gogoproto.nullable) = false];
    // NB: if you add a field here, don't forget to update combine().
  }
  // header is the batch-level response Header declared above. It is embedded,
  // so its fields are promoted onto BatchResponse in the generated Go code.
  Header header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // responses holds one ResponseUnion per request in the matching
  // BatchRequest.
  repeated ResponseUnion responses = 2 [(gogoproto.nullable) = false];
}

// RangeLookupRequest asks a Tenant service to proxy a RangeLookup on the
// caller's behalf. The fields below are a subset of the arguments accepted by
// kv.RangeLookup.
message RangeLookupRequest {
  bytes key = 1 [(gogoproto.casttype) = "RKey"];
  ReadConsistencyType read_consistency = 2;
  int64 prefetch_num = 3;
  bool prefetch_reverse = 4;
}

// RangeLookupResponse carries the outcome of a RangeLookup request that was
// proxied through a Tenant service. The fields mirror the return values of
// kv.RangeLookup.
message RangeLookupResponse {
  repeated RangeDescriptor descriptors = 1 [(gogoproto.nullable) = false];
  repeated RangeDescriptor prefetched_descriptors = 2 [
    (gogoproto.nullable) = false
  ];
  // When error is non-nil, the other fields are empty.
  kv.kvpb.Error error = 3;
}

// RangeFeedRequest expresses the intention to establish a RangeFeed stream
// over the provided span, starting at the specified timestamp.
message RangeFeedRequest {
  Header header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  Span span = 2 [(gogoproto.nullable) = false];
  // with_diff controls whether RangeFeedValue updates also carry the previous
  // value that was overwritten.
  bool with_diff = 3;
  // admission_header is used only at the start of the range feed stream,
  // since the initial catch-up scan can be expensive.
  AdmissionHeader admission_header = 4 [(gogoproto.nullable) = false];
}

// RangeFeedValue is the RangeFeedEvent variant that reports an update to the
// specified key, carrying the provided value.
message RangeFeedValue {
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  Value value = 2 [(gogoproto.nullable) = false];
  // prev_value is populated only when both of the following hold:
  //  - with_diff was passed in the corresponding RangeFeedRequest; and
  //  - before this event, the key-value was present and was not a deletion
  //    tombstone.
  // The timestamp on the previous value is empty.
  Value prev_value = 3 [(gogoproto.nullable) = false];
}

// RangeFeedCheckpoint is the RangeFeedEvent variant that carries a promise:
// the RangeFeed response stream will emit no further RangeFeedValue events
// whose keys fall in the specified span and whose timestamps are less than or
// equal to the specified resolved timestamp.
//
// Note that a resolved timestamp may be lower than the timestamp used in the
// RangeFeedRequest that started the RangeFeed.
message RangeFeedCheckpoint {
  Span span = 1 [(gogoproto.nullable) = false];
  util.hlc.Timestamp resolved_ts = 2 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "ResolvedTS"
  ];
}

// RangeFeedError is a variant of RangeFeedEvent that indicates that an error
// occurred during the processing of the RangeFeed. If emitted, a RangeFeedError
// event will always be the final event on a RangeFeed response stream before
// it is torn down.
message RangeFeedError {
  // error is the error that occurred while processing the RangeFeed.
  kv.kvpb.Error error = 1 [(gogoproto.nullable) = false];
}

// RangeFeedSSTable is the RangeFeedEvent variant that represents an
// AddSSTable operation and contains the entire ingested SST. It is emitted
// only for SSTables written with SSTTimestampToRequestTimestamp enabled, so it
// is guaranteed to comply with the closed timestamp.
//
// The entire SST is emitted even for registrations that have a narrower span;
// it is up to the caller to prune the SST as appropriate. Catchup scans emit
// the data as RangeFeedValue events instead (i.e. they read the ingested KV
// pairs), but Raft log replay will emit RangeFeedSSTable events.
message RangeFeedSSTable {
  // data is the entire ingested SST.
  bytes data = 1;
  // span is advisory: it is the client-provided SST key span, which may be
  // wider than the SST data.
  Span span = 2 [(gogoproto.nullable) = false];
  // write_ts is advisory: it is the MVCC timestamp used for all contained
  // entries.
  util.hlc.Timestamp write_ts = 3 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "WriteTS"
  ];
}

// RangeFeedEvent is the union of every event type that may be returned on a
// RangeFeed response stream.
message RangeFeedEvent {
  option (gogoproto.onlyone) = true;

  // val is set for RangeFeedValue events.
  RangeFeedValue val = 1;
  // checkpoint is set for RangeFeedCheckpoint events.
  RangeFeedCheckpoint checkpoint = 2;
  // error is set for RangeFeedError events.
  RangeFeedError error = 3;
  // sst is set for RangeFeedSSTable events.
  RangeFeedSSTable sst = 4 [(gogoproto.customname) = "SST"];
}


// ResetQuorumRequest makes a range that is unavailable due to lost quorum
// available again, at the cost of losing all of the data in the range. Any
// existing replica, even one residing on the target node, will irrevocably
// be removed. ResetQuorumRequest first uses meta2 to identify the range
// descriptor. Then, it removes all replicas from the range descriptor and
// adds a store from the target node as the one designated survivor replica.
// This change is then written to meta2 and sent as a snapshot to a store
// local to the target node.
//
// This RPC is called by the user directly and will not work for ranges
// that have not lost quorum or for a meta range.
message ResetQuorumRequest {
  // range_id identifies the range whose quorum is to be reset.
  int32 range_id = 1 [(gogoproto.customname) = "RangeID"];
}

// ResetQuorumResponse is the response to a ResetQuorumRequest. It carries no
// fields.
message ResetQuorumResponse {}

// GossipSubscriptionRequest initiates a game of telephone. It establishes an
// indefinite stream that proxies gossip information overheard by the recipient
// node back to the caller. Gossip information is filtered down to just those
// identified by a key matching any of the specified patterns.
//
// Upon establishment of the stream, all existing information that matches one
// or more of the patterns is returned. After this point, only new information
// matching the patterns is returned.
message GossipSubscriptionRequest {
  // patterns lists the patterns used to filter gossip information: only
  // information whose key matches at least one of them is returned.
  repeated string patterns = 1;
}

// GossipSubscriptionEvent carries one piece of proxied gossip information.
message GossipSubscriptionEvent {
  string key = 1;
  Value content = 2 [(gogoproto.nullable) = false];
  // pattern_matched is the pattern that this gossip information matched.
  string pattern_matched = 3;
  // When error is non-nil, the other fields are empty and this is the final
  // event sent on the stream before it is terminated.
  kv.kvpb.Error error = 4;
}

// TenantSettingsRequest establishes an indefinite stream that provides
// up-to-date overrides for tenant settings.
//
// Upon establishment of the stream, the current overrides are returned as an
// event, and any time the overrides change a new event is generated.
message TenantSettingsRequest {
  // tenant_id names the tenant the stream is established for (presumably the
  // tenant whose setting overrides are returned; confirm against the server).
  TenantID tenant_id = 1 [(gogoproto.customname) = "TenantID", (gogoproto.nullable) = false];
}

// TenantSettingsEvent is used to report changes to setting overrides for a
// particular tenant.
//
// Each event pertains to a certain precedence value (see
// TenantSettingsPrecedence).
//
// Note: this API is designed to allow flexibility of implementation on the
// server side (e.g. to make it maintain very little state per tenant).
message TenantSettingsEvent {
  // Precedence must be a valid TenantSettingsPrecedence value.
  uint32 precedence = 1 [(gogoproto.casttype) = "TenantSettingsPrecedence"];

  // Incremental is true if the list of overrides is a list of changes since the
  // last event. In that case, any overrides that have been removed are returned
  // as TenantSettings with empty RawValue and ValueType fields.
  //
  // When Incremental is false, Overrides contains the complete list of
  // current overrides for this precedence.
  //
  // The first event for a precedence is never incremental.
  bool incremental = 2;

  // Overrides contains:
  //  - all current setting overrides for the given precedence if Incremental is
  //    false; or
  //  - the changed overrides since the last event for the precedence if
  //    Incremental is true (removed overrides have empty RawValue and ValueType
  //    fields).
  repeated TenantSetting overrides = 3 [(gogoproto.nullable) = false];

  // If non-empty, the other fields will be empty and this will be the final
  // event sent on the stream before it is terminated. (The field is declared
  // non-nullable, so "unset" is represented by an empty value, not nil.)
  errorspb.EncodedError error = 4 [(gogoproto.nullable) = false];
}

// TenantSetting contains the name and value of a tenant setting.
message TenantSetting {
  // name is the name of the setting.
  string name = 1;
  // value is the setting's value, in encoded form.
  settings.EncodedValue value = 2 [(gogoproto.nullable) = false];
}

// TenantConsumption contains information about resource utilization by a
// tenant, which directly factors into their bill.
message TenantConsumption {
  // r_u is presumably the number of Request Units (RUs) consumed; the RU
  // terminology is taken from the TokenBucketRequest documentation. TODO
  // confirm.
  double r_u = 1;
  uint64 read_requests = 2;
  uint64 read_bytes = 3;
  uint64 write_requests = 4;
  uint64 write_bytes = 5;
  double sql_pods_cpu_seconds = 6 [(gogoproto.customname) = "SQLPodsCPUSeconds"];
  uint64 pgwire_egress_bytes = 7 [(gogoproto.customname) = "PGWireEgressBytes"];

  // Note: if any fields are changed, the Sub and Add methods must be updated.
}

// TokenBucketRequest is sent by tenants to obtain Request Units and to report
// consumption.
message TokenBucketRequest {
  // consumption_since_last_request is the consumption that occurred since this
  // node's last request.
  TenantConsumption consumption_since_last_request = 1 [
    (gogoproto.nullable) = false
  ];

  uint64 tenant_id = 2 [(gogoproto.customname) = "TenantID"];

  // instance_id is the ID of the SQL pod instance from which the request
  // originates.
  uint32 instance_id = 3 [(gogoproto.customname) = "InstanceID"];

  // next_live_instance_id is the ID of the next live SQL instance for this
  // tenant, in circular order by ID. That is:
  //  - if this instance has the largest ID, it is the smallest ID of a live
  //    instance;
  //  - otherwise, it is the smallest ID of a live instance that is greater
  //    than InstanceID.
  //
  // In particular, when this is the only live instance, NextLiveInstanceID
  // equals InstanceID.
  //
  // The server uses this field to trigger checking for dead instances; it is
  // acceptable for the information to be stale.
  //
  // Can be zero if this information is not currently available.
  uint32 next_live_instance_id = 8 [
    (gogoproto.customname) = "NextLiveInstanceID"
  ];

  // instance_lease uniquely identifies the SQL pod instance from which the
  // request originates, in light of ID reuse.
  bytes instance_lease = 4;

  // seq_num is a strictly positive, monotonically increasing sequence number.
  // The server uses it to detect duplicate requests (to avoid
  // double-charging): the consumption reported in a request with a lower
  // sequence number than the last request is ignored.
  int64 seq_num = 7;

  // requested_r_u is the number of requested tokens (RUs).
  double requested_r_u = 5;

  // target_request_period is the approximate frequency of requests, used to
  // limit TrickleDuration in the response.
  google.protobuf.Duration target_request_period = 6 [
    (gogoproto.nullable) = false,
    (gogoproto.stdduration) = true
  ];
}

// TokenBucketResponse is the response to a TokenBucketRequest.
message TokenBucketResponse {
  // error stores any error that occurs on the server, which lets callers
  // differentiate between server-side errors and RPC errors. If it is
  // non-empty, the other fields will be empty.
  errorspb.EncodedError error = 1 [(gogoproto.nullable) = false];

  // granted_r_u (GrantedRU) is the amount of RUs granted. In most cases it
  // equals the requested RUs; the only exception is when TrickleDuration would
  // otherwise exceed TargetRequestPeriod, in which case the number of granted
  // RUs is reduced.
  double granted_r_u = 2;

  // trickle_duration describes how the granted tokens can be consumed.
  //
  // If zero, the granted tokens can be used immediately, without restriction.
  // If set, the granted tokens become available at a constant rate over the
  // trickle time period. E.g. if we are granted 1000 tokens and the trickle
  // duration is 10 seconds, the tokens become available at a rate of
  // 100 tokens/sec for the next 10 seconds.
  //
  // TrickleDuration is at most the given TargetRequestPeriod.
  google.protobuf.Duration trickle_duration = 3 [
    (gogoproto.nullable) = false,
    (gogoproto.stdduration) = true
  ];

  // fallback_rate is a rate (in RU/s) that the instance will use once it runs
  // out of tokens and a problem prevents TokenBucket requests from completing.
  double fallback_rate = 4;
}

// JoinNodeRequest is used to specify to the server node what the client's
// binary version is. If it's not compatible with the rest of the cluster, the
// join attempt is refused.
message JoinNodeRequest {
  // binary_version is the binary version of the client (the joining node).
  roachpb.Version binary_version = 1;
}

// JoinNodeResponse informs the joining node what the cluster id is, what
// node id was allocated to it, and what store ID to use for its first store. It
// also informs the node what the current active version is.
message JoinNodeResponse {
  // cluster_id is the ID of the cluster.
  bytes cluster_id = 1 [(gogoproto.customname) = "ClusterID"];
  // node_id is the node ID allocated to the joining node.
  int32 node_id = 2 [(gogoproto.customname) = "NodeID"];
  // store_id is the store ID the joining node should use for its first store.
  int32 store_id = 3 [(gogoproto.customname) = "StoreID"];
  // active_version is the current active version.
  roachpb.Version active_version = 4;
}

// Internal is the Batch and RangeFeed service implemented by nodes for KV API
// requests.
service Internal {
  // Batch takes a BatchRequest and returns a BatchResponse.
  rpc Batch(BatchRequest) returns (BatchResponse) {}

  // RangeLookup serves RangeLookup requests proxied through a Tenant service.
  rpc RangeLookup(RangeLookupRequest) returns (RangeLookupResponse) {}

  // RangeFeed establishes a RangeFeed stream and returns its events.
  rpc RangeFeed(RangeFeedRequest) returns (stream RangeFeedEvent) {}

  // GossipSubscription establishes a stream of proxied gossip information.
  rpc GossipSubscription(GossipSubscriptionRequest) returns (stream GossipSubscriptionEvent) {}

  // ResetQuorum makes a range that lost quorum available again; see
  // ResetQuorumRequest.
  rpc ResetQuorum(ResetQuorumRequest) returns (ResetQuorumResponse) {}

  // TokenBucket is used by tenants to obtain Request Units and report
  // consumption.
  rpc TokenBucket(TokenBucketRequest) returns (TokenBucketResponse) {}

  // Join joins a bootstrapped cluster. If the target node is itself not part
  // of a bootstrapped cluster, an appropriate error is returned.
  rpc Join(JoinNodeRequest) returns (JoinNodeResponse) {}

  // GetSpanConfigs is used to fetch the span configurations over a given
  // keyspan.
  rpc GetSpanConfigs(GetSpanConfigsRequest) returns (GetSpanConfigsResponse) {}

  // UpdateSpanConfigs is used to update the span configurations over given
  // keyspans.
  rpc UpdateSpanConfigs(UpdateSpanConfigsRequest) returns (UpdateSpanConfigsResponse) {}

  // TenantSettings is used by tenants to obtain and stay up to date with
  // tenant setting overrides.
  rpc TenantSettings(TenantSettingsRequest) returns (stream TenantSettingsEvent) {}
}

// ContentionEvent is attached to BatchResponses to indicate a conflict with
// another transaction during replica evaluation.
message ContentionEvent {
  option (gogoproto.goproto_stringer) = false;

  // key is the key on which this transaction and the other transaction
  // conflicted.
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  // txn_meta is the transaction conflicted with, i.e. the transaction holding
  // a lock.
  cockroach.storage.enginepb.TxnMeta txn_meta = 2 [
    (gogoproto.nullable) = false
  ];
  // duration is the time spent contending against the other transaction.
  google.protobuf.Duration duration = 3 [
    (gogoproto.nullable) = false,
    (gogoproto.stdduration) = true
  ];
}

// ScanStats is a message that will be attached to BatchResponses containing
// information about what happened during each scan and get in the request.
//
// NOTE(review): the distinction between "interface" and "internal" seeks and
// steps is not defined in this file; presumably the counters mirror
// statistics reported by the storage iterator. Confirm before relying on it.
message ScanStats {
  option (gogoproto.goproto_stringer) = false;

  uint64 num_interface_seeks = 1;
  uint64 num_internal_seeks = 2;
  uint64 num_interface_steps = 3;
  uint64 num_internal_steps = 4;
}