Skip to content

Commit

Permalink
tmpnet: Improve subnet configuration (#2871)
Browse files Browse the repository at this point in the history
Signed-off-by: Stephen Buttolph <[email protected]>
Co-authored-by: Stephen Buttolph <[email protected]>
  • Loading branch information
marun and StephenButtolph authored Mar 27, 2024
1 parent 2f16057 commit cb9386f
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 67 deletions.
8 changes: 5 additions & 3 deletions tests/fixture/tmpnet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,11 @@ HOME
├── config.json // Common configuration (including defaults and pre-funded keys)
├── genesis.json // Genesis for all nodes
├── network.env // Sets network dir env var to simplify network usage
└── subnets // Parent directory for subnet definitions
├─ subnet-a.json // Configuration for subnet-a and its chain(s)
└─ subnet-b.json // Configuration for subnet-b and its chain(s)
└── subnets // Directory containing subnet config for both avalanchego and tmpnet
├── subnet-a.json // tmpnet configuration for subnet-a and its chain(s)
├── subnet-b.json // tmpnet configuration for subnet-b and its chain(s)
└── 2jRbWtaonb2RP8DEM5DBsd7o2o8d...RqNs9 // The ID of a subnet is the name of its configuration dir
└── config.json // avalanchego configuration for subnet
```

### Common networking configuration
Expand Down
161 changes: 110 additions & 51 deletions tests/fixture/tmpnet/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,37 @@ func (n *Network) StartNode(ctx context.Context, w io.Writer, node *Node) error
return nil
}

// RestartNode stops the given node and starts it again, then waits for it
// to report healthy before returning.
func (n *Network) RestartNode(ctx context.Context, w io.Writer, node *Node) error {
	// Saving the API port ensures the node reuses it across restarts.
	// Prometheus automatically attaches an `instance` label (host:port) to
	// scraped metrics, so a fresh port on every restart would split a
	// node's metrics into a separate series per port and leave the grafana
	// dashboards displaying multiple series per graph for a single node.
	//
	// There is a non-zero chance that the saved port gets allocated to a
	// different process before the node restarts (which would prevent the
	// node from starting), but the alternative is updating the grafana
	// dashboards query-by-query so that node metrics ignore the instance
	// label.
	if err := node.SaveAPIPort(); err != nil {
		return err
	}

	if err := node.Stop(ctx); err != nil {
		return fmt.Errorf("failed to stop node %s: %w", node.NodeID, err)
	}
	if err := n.StartNode(ctx, w, node); err != nil {
		return fmt.Errorf("failed to start node %s: %w", node.NodeID, err)
	}
	if _, err := fmt.Fprintf(w, " waiting for node %s to report healthy\n", node.NodeID); err != nil {
		return err
	}
	return WaitForHealthy(ctx, node)
}

// Waits until all nodes in the network are healthy.
func (n *Network) WaitForHealthy(ctx context.Context, w io.Writer) error {
ticker := time.NewTicker(networkHealthCheckInterval)
Expand Down Expand Up @@ -441,33 +472,7 @@ func (n *Network) Restart(ctx context.Context, w io.Writer) error {
return err
}
for _, node := range n.Nodes {
// Ensure the node reuses the same API port across restarts to ensure
// consistent labeling of metrics. Otherwise prometheus's automatic
// addition of the `instance` label (host:port) results in
// segmentation of results for a given node every time the port
// changes on restart. This segmentation causes graphs on the grafana
// dashboards to display multiple series per graph for a given node,
// one for each port that the node used.
//
// There is a non-zero chance of the port being allocatted to a
// different process and the node subsequently being unable to start,
// but the alternative is having to update the grafana dashboards
// query-by-query to ensure that node metrics ignore the instance
// label.
if err := node.SaveAPIPort(); err != nil {
return err
}

if err := node.Stop(ctx); err != nil {
return fmt.Errorf("failed to stop node %s: %w", node.NodeID, err)
}
if err := n.StartNode(ctx, w, node); err != nil {
return fmt.Errorf("failed to start node %s: %w", node.NodeID, err)
}
if _, err := fmt.Fprintf(w, " waiting for node %s to report healthy\n", node.NodeID); err != nil {
return err
}
if err := WaitForHealthy(ctx, node); err != nil {
if err := n.RestartNode(ctx, w, node); err != nil {
return err
}
}
Expand Down Expand Up @@ -501,10 +506,20 @@ func (n *Network) EnsureNodeConfig(node *Node) error {

// Set fields including the network path
if len(n.Dir) > 0 {
node.Flags.SetDefaults(FlagsMap{
defaultFlags := FlagsMap{
config.GenesisFileKey: n.getGenesisPath(),
config.ChainConfigDirKey: n.getChainConfigDir(),
})
}

// Only set the subnet dir if it exists or the node won't start.
subnetDir := n.getSubnetDir()
if _, err := os.Stat(subnetDir); err == nil {
defaultFlags[config.SubnetConfigDirKey] = subnetDir
} else if !errors.Is(err, os.ErrNotExist) {
return err
}

node.Flags.SetDefaults(defaultFlags)

// Ensure the node's data dir is configured
dataDir := node.getDataDir()
Expand All @@ -522,17 +537,26 @@ func (n *Network) EnsureNodeConfig(node *Node) error {
}
}

// Ensure available subnets are tracked
return nil
}

// TrackedSubnetsForNode returns a comma-separated list of the IDs of the
// created subnets that the given node is configured to validate. The result
// is suitable for use as the value of avalanchego's track-subnets flag.
func (n *Network) TrackedSubnetsForNode(nodeID ids.NodeID) string {
	subnetIDs := make([]string, 0, len(n.Subnets))
	for _, subnet := range n.Subnets {
		if subnet.SubnetID == ids.Empty {
			// Subnet has not yet been created
			continue
		}
		// Only track subnets that this node validates
		for _, validatorID := range subnet.ValidatorIDs {
			if validatorID == nodeID {
				subnetIDs = append(subnetIDs, subnet.SubnetID.String())
				break
			}
		}
	}
	return strings.Join(subnetIDs, ",")
}

func (n *Network) GetSubnet(name string) *Subnet {
Expand All @@ -544,18 +568,22 @@ func (n *Network) GetSubnet(name string) *Subnet {
return nil
}

// Ensure that each subnet on the network is created and that it is validated by all non-ephemeral nodes.
// Ensure that each subnet on the network is created.
func (n *Network) CreateSubnets(ctx context.Context, w io.Writer) error {
createdSubnets := make([]*Subnet, 0, len(n.Subnets))
for _, subnet := range n.Subnets {
if _, err := fmt.Fprintf(w, "Creating subnet %q\n", subnet.Name); err != nil {
return err
if len(subnet.ValidatorIDs) == 0 {
return fmt.Errorf("subnet %s needs at least one validator", subnet.SubnetID)
}
if subnet.SubnetID != ids.Empty {
// The subnet already exists
continue
}

if _, err := fmt.Fprintf(w, "Creating subnet %q\n", subnet.Name); err != nil {
return err
}

if subnet.OwningKey == nil {
// Allocate a pre-funded key and remove it from the network so it won't be used for
// other purposes
Expand Down Expand Up @@ -599,34 +627,53 @@ func (n *Network) CreateSubnets(ctx context.Context, w io.Writer) error {
return err
}

// Reconfigure nodes for the new subnets
if _, err := fmt.Fprintf(w, "Configured nodes to track new subnet(s). Restart is required.\n"); err != nil {
if _, err := fmt.Fprintf(w, "Restarting node(s) to enable them to track the new subnet(s)\n"); err != nil {
return err
}
reconfiguredNodes := []*Node{}
for _, node := range n.Nodes {
if err := n.EnsureNodeConfig(node); err != nil {
existingTrackedSubnets, err := node.Flags.GetStringVal(config.TrackSubnetsKey)
if err != nil {
return err
}
trackedSubnets := n.TrackedSubnetsForNode(node.NodeID)
if existingTrackedSubnets == trackedSubnets {
continue
}
node.Flags[config.TrackSubnetsKey] = trackedSubnets
reconfiguredNodes = append(reconfiguredNodes, node)
}
// Restart nodes to allow new configuration to take effect
// TODO(marun) Only restart the validator nodes of newly-created subnets
if err := n.Restart(ctx, w); err != nil {
return err
for _, node := range reconfiguredNodes {
if err := n.RestartNode(ctx, w, node); err != nil {
return err
}
}

// Add each node as a subnet validator
// Add validators for the subnet
for _, subnet := range createdSubnets {
if _, err := fmt.Fprintf(w, "Adding validators for subnet %q\n", subnet.Name); err != nil {
return err
}
if err := subnet.AddValidators(ctx, w, n.Nodes); err != nil {

// Collect the nodes intended to validate the subnet
validatorIDs := set.NewSet[ids.NodeID](len(subnet.ValidatorIDs))
validatorIDs.Add(subnet.ValidatorIDs...)
validatorNodes := []*Node{}
for _, node := range n.Nodes {
if !validatorIDs.Contains(node.NodeID) {
continue
}
validatorNodes = append(validatorNodes, node)
}

if err := subnet.AddValidators(ctx, w, validatorNodes...); err != nil {
return err
}
}

// Wait for nodes to become subnet validators
pChainClient := platformvm.NewClient(n.Nodes[0].URI)
restartRequired := false
validatorsToRestart := set.Set[ids.NodeID]{}
for _, subnet := range createdSubnets {
if err := waitForActiveValidators(ctx, w, pChainClient, subnet); err != nil {
return err
Expand All @@ -649,17 +696,29 @@ func (n *Network) CreateSubnets(ctx context.Context, w io.Writer) error {
// subnet's validator nodes will need to be restarted for those nodes to read
// the newly written chain configuration and apply it to the chain(s).
if subnet.HasChainConfig() {
restartRequired = true
validatorsToRestart.Add(subnet.ValidatorIDs...)
}
}

if !restartRequired {
if len(validatorsToRestart) == 0 {
return nil
}

if _, err := fmt.Fprintf(w, "Restarting node(s) to pick up chain configuration\n"); err != nil {
return err
}

// Restart nodes to allow configuration for the new chains to take effect
// TODO(marun) Only restart the validator nodes of subnets that have chains that need configuring
return n.Restart(ctx, w)
for _, node := range n.Nodes {
if !validatorsToRestart.Contains(node.NodeID) {
continue
}
if err := n.RestartNode(ctx, w, node); err != nil {
return err
}
}

return nil
}

func (n *Network) GetURIForNodeID(nodeID ids.NodeID) (string, error) {
Expand Down
Loading

0 comments on commit cb9386f

Please sign in to comment.