Skip to content

Commit

Permalink
server, tenant: gate process debugging behind capability
Browse files Browse the repository at this point in the history
Previously, all tenant servers were started with a debug server
that granted process-manipulating power via pprof and vmodule HTTP
endpoints.

This implementation is fine when servers serve just 1 tenant in
a given process; that tenant then legitimately has access to all
process-level control. However, it becomes a problem in shared-process
multitenancy, when the same process is shared by multiple tenants. In
that case, it is undesirable for 1 tenant to have access to & control
process properties, as it could influence the well-functioning of
other tenants or potentially leak data across tenant boundaries.

This commit gates access to the debug server behind a capability
**only with shared process multitenancy**. Tenant servers running
within their own process will contain a debug server with no
capability gating since they own their process.

The gating is implemented via a tenant authorizer function backed
by the capability store that is injected into the debug server
upon startup. Shared process tenants are provided this authorizer,
while separate process tenants and the system tenant use the no-op
authorizer.

Epic: CRDB-12100
Resolves: #97946

Release note: None

Co-authored-by: Raphael 'kena' Poss <[email protected]>
  • Loading branch information
dhartunian and knz committed Jun 26, 2023
1 parent 763b105 commit 4fb92bf
Show file tree
Hide file tree
Showing 18 changed files with 310 additions and 49 deletions.
11 changes: 11 additions & 0 deletions pkg/ccl/logictestccl/testdata/logic_test/tenant_capability
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ can_admin_scatter true
can_admin_split true
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand All @@ -61,6 +62,7 @@ can_admin_scatter true
can_admin_split true
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand All @@ -79,6 +81,7 @@ can_admin_scatter true
can_admin_split false
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand All @@ -104,6 +107,7 @@ can_admin_scatter true
can_admin_split true
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand All @@ -129,6 +133,7 @@ can_admin_scatter true
can_admin_split true
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand All @@ -154,6 +159,7 @@ can_admin_scatter true
can_admin_split true
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info true
can_view_tsdb_metrics false
Expand All @@ -172,6 +178,7 @@ can_admin_scatter true
can_admin_split false
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand All @@ -190,6 +197,7 @@ can_admin_scatter true
can_admin_split false
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand All @@ -208,6 +216,7 @@ can_admin_scatter true
can_admin_split false
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand Down Expand Up @@ -268,6 +277,7 @@ can_admin_scatter true
can_admin_split true
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand Down Expand Up @@ -297,6 +307,7 @@ can_admin_scatter true
can_admin_split true
can_admin_unsplit false
can_check_consistency false
can_debug_process false
can_use_nodelocal_storage false
can_view_node_info false
can_view_tsdb_metrics false
Expand Down
133 changes: 133 additions & 0 deletions pkg/ccl/serverccl/server_sql_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ import (
"errors"
"fmt"
"io"
"net/http"
"strings"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/ccl"
"github.com/cockroachdb/cockroach/pkg/ccl/utilccl/licenseccl"
"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilities"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/security"
"github.com/cockroachdb/cockroach/pkg/server/systemconfigwatcher/systemconfigwatchertest"
Expand Down Expand Up @@ -190,6 +192,137 @@ func TestTenantHTTP(t *testing.T) {

}

// TestTenantProcessDebugging verifies that in-process SQL tenant servers gate
// process debugging behind capabilities.
func TestTenantProcessDebugging(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()

tc := serverutils.StartNewTestCluster(t, 1, base.TestClusterArgs{ServerArgs: base.TestServerArgs{
DefaultTestTenant: base.TestTenantDisabled,
}})
defer tc.Stopper().Stop(ctx)
db := tc.ServerConn(0)
defer db.Close()

s := tc.Server(0)

tenant, _, err := s.StartSharedProcessTenant(ctx,
base.TestSharedProcessTenantArgs{
TenantID: serverutils.TestTenantID(),
TenantName: "processdebug",
})
require.NoError(t, err)
defer tenant.Stopper().Stop(ctx)

t.Run("system tenant pprof", func(t *testing.T) {
httpClient, err := s.GetAdminHTTPClient()
require.NoError(t, err)
defer httpClient.CloseIdleConnections()

url := s.AdminURL().URL
url.Path = url.Path + "/debug/pprof/goroutine"
q := url.Query()
q.Add("debug", "2")
url.RawQuery = q.Encode()

resp, err := httpClient.Get(url.String())
require.NoError(t, err)
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
require.NoError(t, err)

require.Equal(t, http.StatusOK, resp.StatusCode)
require.Contains(t, string(body), "goroutine")
})

t.Run("pprof", func(t *testing.T) {
httpClient, err := tenant.GetAdminHTTPClient()
require.NoError(t, err)
defer httpClient.CloseIdleConnections()

url := tenant.AdminURL().URL
url.Path = url.Path + "/debug/pprof/"
q := url.Query()
q.Add("debug", "2")
url.RawQuery = q.Encode()

resp, err := httpClient.Get(url.String())
require.NoError(t, err)
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
require.NoError(t, err)
require.Equal(t, http.StatusForbidden, resp.StatusCode)
require.Contains(t, string(body), "tenant does not have capability to debug the running process")

_, err = db.Exec(`ALTER TENANT processdebug GRANT CAPABILITY can_debug_process=true`)
require.NoError(t, err)

tc.WaitForTenantCapabilities(t, serverutils.TestTenantID(), map[tenantcapabilities.ID]string{
tenantcapabilities.CanDebugProcess: "true",
})

resp, err = httpClient.Get(url.String())
require.NoError(t, err)
defer resp.Body.Close()
body, err = io.ReadAll(resp.Body)
require.NoError(t, err)
require.Equal(t, http.StatusOK, resp.StatusCode)
require.Contains(t, string(body), "goroutine")

_, err = db.Exec(`ALTER TENANT processdebug REVOKE CAPABILITY can_debug_process`)
require.NoError(t, err)

tc.WaitForTenantCapabilities(t, serverutils.TestTenantID(), map[tenantcapabilities.ID]string{
tenantcapabilities.CanDebugProcess: "false",
})
})

t.Run("vmodule", func(t *testing.T) {
httpClient, err := tenant.GetAdminHTTPClient()
require.NoError(t, err)
defer httpClient.CloseIdleConnections()

url := tenant.AdminURL().URL
url.Path = url.Path + "/debug/vmodule"
q := url.Query()
q.Add("duration", "-1s")
q.Add("vmodule", "exec_log=3")
url.RawQuery = q.Encode()

resp, err := httpClient.Get(url.String())
require.NoError(t, err)
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
require.NoError(t, err)
require.Equal(t, http.StatusForbidden, resp.StatusCode)
require.Contains(t, string(body), "tenant does not have capability to debug the running process")

_, err = db.Exec(`ALTER TENANT processdebug GRANT CAPABILITY can_debug_process=true`)
require.NoError(t, err)

tc.WaitForTenantCapabilities(t, serverutils.TestTenantID(), map[tenantcapabilities.ID]string{
tenantcapabilities.CanDebugProcess: "true",
})

resp, err = httpClient.Get(url.String())
require.NoError(t, err)
defer resp.Body.Close()
body, err = io.ReadAll(resp.Body)
require.NoError(t, err)
require.Equal(t, http.StatusOK, resp.StatusCode)
require.Contains(t, string(body), "previous vmodule configuration: \nnew vmodule configuration: exec_log=3\n")

_, err = db.Exec(`ALTER TENANT processdebug REVOKE CAPABILITY can_debug_process`)
require.NoError(t, err)

tc.WaitForTenantCapabilities(t, serverutils.TestTenantID(), map[tenantcapabilities.ID]string{
tenantcapabilities.CanDebugProcess: "false",
})
})
}

func TestNonExistentTenant(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
Expand Down
6 changes: 6 additions & 0 deletions pkg/multitenant/tenantcapabilities/capabilities.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ const (
// span configs.
TenantSpanConfigBounds // span_config_bounds

// CanDebugProcess describes the ability of a tenant to set vmodule on the
// process and run pprof profiles and tools. This can reveal information
// across tenant boundaries.
CanDebugProcess // can_debug_process

MaxCapabilityID ID = iota - 1
)

Expand Down Expand Up @@ -114,6 +119,7 @@ var capabilities = [MaxCapabilityID + 1]Capability{
CanViewTSDBMetrics: boolCapability(CanViewTSDBMetrics),
ExemptFromRateLimiting: boolCapability(ExemptFromRateLimiting),
TenantSpanConfigBounds: spanConfigBoundsCapability(TenantSpanConfigBounds),
CanDebugProcess: boolCapability(CanDebugProcess),
}

// EnableAll enables maximum access to services.
Expand Down
9 changes: 7 additions & 2 deletions pkg/multitenant/tenantcapabilities/id_string.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pkg/multitenant/tenantcapabilities/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ type Authorizer interface {
// IsExemptFromRateLimiting returns true of the tenant should
// not be subject to rate limiting.
IsExemptFromRateLimiting(ctx context.Context, tenID roachpb.TenantID) bool

// HasProcessDebugCapability returns an error if a tenant, referenced by its ID,
// is not allowed to debug the running process.
HasProcessDebugCapability(ctx context.Context, tenID roachpb.TenantID) error
}

// Entry ties together a tenantID with its capabilities.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,10 @@ func (n *AllowEverythingAuthorizer) IsExemptFromRateLimiting(
) bool {
return true
}

// HasProcessDebugCapability implements the tenantcapabilities.Authorizer interface.
func (n *AllowEverythingAuthorizer) HasProcessDebugCapability(
ctx context.Context, tenID roachpb.TenantID,
) error {
return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,10 @@ func (n *AllowNothingAuthorizer) HasNodelocalStorageCapability(
func (n *AllowNothingAuthorizer) IsExemptFromRateLimiting(context.Context, roachpb.TenantID) bool {
return false
}

// HasProcessDebugCapability implements the tenantcapabilities.Authorizer interface.
func (n *AllowNothingAuthorizer) HasProcessDebugCapability(
ctx context.Context, tenID roachpb.TenantID,
) error {
return errors.New("operation blocked")
}
Original file line number Diff line number Diff line change
Expand Up @@ -390,3 +390,32 @@ func (a *Authorizer) getMode(
}
return cp, selectedMode
}

func (a *Authorizer) HasProcessDebugCapability(ctx context.Context, tenID roachpb.TenantID) error {
if tenID.IsSystem() {
return nil
}
errFn := func() error {
return errors.New("client tenant does not have capability to debug the process")
}
cp, mode := a.getMode(ctx, tenID)
switch mode {
case authorizerModeOn:
break // fallthrough to the next check.
case authorizerModeAllowAll:
return nil
case authorizerModeV222:
return errFn()
default:
err := errors.AssertionFailedf("unknown authorizer mode: %d", mode)
logcrash.ReportOrPanic(ctx, &a.settings.SV, "%v", err)
return err
}

if !tenantcapabilities.MustGetBoolByID(
cp, tenantcapabilities.CanDebugProcess,
) {
return errFn()
}
return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ client tenant does not have capability "can_admin_scatter" (*kvpb.AdminScatterRe

has-capability-for-batch ten=10 cmds=(Merge)
----
client tenant does not have capability "ID(12)" (*kvpb.MergeRequest)
client tenant does not have capability "ID(13)" (*kvpb.MergeRequest)

has-node-status-capability ten=10
----
Expand All @@ -34,7 +34,7 @@ ok

has-capability-for-batch ten=10 cmds=(Merge)
----
client tenant does not have capability "ID(12)" (*kvpb.MergeRequest)
client tenant does not have capability "ID(13)" (*kvpb.MergeRequest)

has-node-status-capability ten=10
----
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ message TenantCapabilities {
// CanCheckConsistency if set to true, grants the tenant the ability to run
// range consistency checks.
bool can_check_consistency = 10;

// CanDebugProcess, if set to true, grants the tenant the ability to
// set vmodule on the process and run pprof profiles and tools.
bool can_debug_process = 11;
};

// SpanConfigBound is used to constrain the possible values a SpanConfig may
Expand Down
Loading

0 comments on commit 4fb92bf

Please sign in to comment.