-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
support.go
232 lines (203 loc) · 7.31 KB
/
support.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package aws
import (
	"bytes"
	"context"
	"encoding/json"
	"io/ioutil"
	"os"
	"os/exec"
	"strings"
	"text/template"

	"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
)
// awsStartupScriptTemplate is the text/template source of the user-data script
// that sets up an AWS machine for roachprod use.
//
// Both M5 and I3 machines expose their EBS or local SSD volumes as NVMe block
// devices, but the actual device numbers vary a bit between the two types.
// The script collects every /dev/nvme?n1 and /dev/xvdd device that is not yet
// mounted and then:
//   - with no such disk, only creates the /mnt/data1 directory;
//   - with exactly one disk, or when UseMultipleDisks is set, creates an ext4
//     filesystem on each disk and mounts it at /mnt/data<N>;
//   - otherwise, stripes all disks into a RAID 0 array (/dev/md0) mounted at
//     /mnt/data1.
//
// Every mountpoint is chmod'ed to 777. The script then configures chrony,
// sshd, the open-file limits, and core dumps, and finally touches
// /mnt/data1/.roachprod-initialized.
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/nvme-ebs-volumes.html
//
// This is a template because the instantiator needs to optionally configure the
// mounting options. The script cannot take arguments since it is to be invoked
// by the aws tool which cannot pass args.
//
// Template parameters:
//   - ExtraMountOpts: if non-empty, appended after a comma to the default
//     "defaults" mount options.
//   - UseMultipleDisks: if true, disks are mounted individually instead of
//     being combined into a RAID 0 array.
const awsStartupScriptTemplate = `#!/usr/bin/env bash
# Script for setting up a AWS machine for roachprod use.
set -x
sudo apt-get update
sudo apt-get install -qy --no-install-recommends mdadm
mount_opts="defaults"
{{if .ExtraMountOpts}}mount_opts="${mount_opts},{{.ExtraMountOpts}}"{{end}}
use_multiple_disks='{{if .UseMultipleDisks}}true{{end}}'
disks=()
mount_prefix="/mnt/data"
# On different machine types, the drives are either called nvme... or xvdd.
for d in $(ls /dev/nvme?n1 /dev/xvdd); do
if ! mount | grep ${d}; then
disks+=("${d}")
echo "Disk ${d} not mounted, need to mount..."
else
echo "Disk ${d} already mounted, skipping..."
fi
done
if [ "${#disks[@]}" -eq "0" ]; then
mountpoint="${mount_prefix}1"
echo "No disks mounted, creating ${mountpoint}"
mkdir -p ${mountpoint}
chmod 777 ${mountpoint}
elif [ "${#disks[@]}" -eq "1" ] || [ -n "$use_multiple_disks" ]; then
disknum=1
for disk in "${disks[@]}"
do
mountpoint="${mount_prefix}${disknum}"
disknum=$((disknum + 1 ))
echo "Mounting ${disk} at ${mountpoint}"
mkdir -p ${mountpoint}
mkfs.ext4 -F ${disk}
mount -o ${mount_opts} ${disk} ${mountpoint}
chmod 777 ${mountpoint}
echo "${disk} ${mountpoint} ext4 ${mount_opts} 1 1" | tee -a /etc/fstab
done
else
mountpoint="${mount_prefix}1"
echo "${#disks[@]} disks mounted, creating ${mountpoint} using RAID 0"
mkdir -p ${mountpoint}
raiddisk="/dev/md0"
mdadm --create ${raiddisk} --level=0 --raid-devices=${#disks[@]} "${disks[@]}"
mkfs.ext4 -F ${raiddisk}
mount -o ${mount_opts} ${raiddisk} ${mountpoint}
chmod 777 ${mountpoint}
echo "${raiddisk} ${mountpoint} ext4 ${mount_opts} 1 1" | tee -a /etc/fstab
fi
sudo apt-get install -qy chrony
# Override the chrony config. In particular,
# log aggressively when clock is adjusted (0.01s)
# and exclusively use a single time server.
sudo cat <<EOF > /etc/chrony/chrony.conf
keyfile /etc/chrony/chrony.keys
commandkey 1
driftfile /var/lib/chrony/chrony.drift
log tracking measurements statistics
logdir /var/log/chrony
maxupdateskew 100.0
dumponexit
dumpdir /var/lib/chrony
logchange 0.01
hwclockfile /etc/adjtime
rtcsync
server 169.254.169.123 prefer iburst
makestep 0.1 3
EOF
sudo /etc/init.d/chrony restart
sudo chronyc -a waitsync 30 0.01 | sudo tee -a /root/chrony.log
# sshguard can prevent frequent ssh connections to the same host. Disable it.
sudo service sshguard stop
# increase the number of concurrent unauthenticated connections to the sshd
# daemon. See https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Load_Balancing.
# By default, only 10 unauthenticated connections are permitted before sshd
# starts randomly dropping connections.
sudo sh -c 'echo "MaxStartups 64:30:128" >> /etc/ssh/sshd_config'
# Crank up the logging for issues such as:
# https://github.com/cockroachdb/cockroach/issues/36929
sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG3/' /etc/ssh/sshd_config
sudo service sshd restart
# increase the default maximum number of open file descriptors for
# root and non-root users. Load generators running a lot of concurrent
# workers bump into this often.
sudo sh -c 'echo "root - nofile 1048576\n* - nofile 1048576" > /etc/security/limits.d/10-roachprod-nofiles.conf'
# Enable core dumps
cat <<EOF > /etc/security/limits.d/core_unlimited.conf
* soft core unlimited
* hard core unlimited
root soft core unlimited
root hard core unlimited
EOF
mkdir -p /mnt/data1/cores
chmod a+w /mnt/data1/cores
CORE_PATTERN="/mnt/data1/cores/core.%e.%p.%h.%t"
echo "$CORE_PATTERN" > /proc/sys/kernel/core_pattern
sed -i'~' 's/enabled=1/enabled=0/' /etc/default/apport
sed -i'~' '/.*kernel\\.core_pattern.*/c\\' /etc/sysctl.conf
echo "kernel.core_pattern=$CORE_PATTERN" >> /etc/sysctl.conf
sysctl --system # reload sysctl settings
sudo touch /mnt/data1/.roachprod-initialized
`
// writeStartupScript renders awsStartupScriptTemplate into a newly created
// temp file and returns the path to that file.
// After use, the caller should delete the temp file.
//
// extraMountOpts, if not empty, is appended to the default mount options. It is
// a comma-separated list of options for the "mount -o" flag.
//
// useMultiple is passed to the template as UseMultipleDisks.
//
// On any error the temp file is removed before returning, so the caller never
// has to clean up a file whose path it was not given.
func writeStartupScript(extraMountOpts string, useMultiple bool) (string, error) {
	type tmplParams struct {
		ExtraMountOpts   string
		UseMultipleDisks bool
	}
	args := tmplParams{ExtraMountOpts: extraMountOpts, UseMultipleDisks: useMultiple}

	// Parse before creating the file so that a parse panic cannot leave a
	// stray temp file behind.
	t := template.Must(template.New("start").Parse(awsStartupScriptTemplate))

	tmpfile, err := ioutil.TempFile("", "aws-startup-script")
	if err != nil {
		return "", err
	}
	path := tmpfile.Name()

	if err := t.Execute(tmpfile, args); err != nil {
		// Best-effort cleanup; the template error is the one worth reporting.
		_ = tmpfile.Close()
		_ = os.Remove(path)
		return "", err
	}
	// A failed Close can mean the script was not fully written; surface it
	// rather than handing back a possibly truncated file.
	if err := tmpfile.Close(); err != nil {
		_ = os.Remove(path)
		return "", err
	}
	return path, nil
}
// runCommand is used to invoke an AWS command. It runs the "aws" binary with
// the given args, appending "--profile <p.Profile>" when a profile is set, and
// returns the command's standard output.
//
// On failure the returned error wraps the underlying error and includes the
// full command line and the captured standard error. If the command ran but
// exited unsuccessfully, the captured standard error is also logged.
func (p *Provider) runCommand(args []string) ([]byte, error) {
	if p.Profile != "" {
		// The full slice expression caps the capacity so that append copies
		// instead of writing into the caller's backing array.
		args = append(args[:len(args):len(args)], "--profile", p.Profile)
	}
	var stderrBuf bytes.Buffer
	cmd := exec.Command("aws", args...)
	cmd.Stderr = &stderrBuf
	output, err := cmd.Output()
	if err != nil {
		if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) {
			// Output only fills in ExitError.Stderr when cmd.Stderr is nil.
			// Stderr is redirected to stderrBuf above, so log the buffer.
			log.Infof(context.Background(), "%s", stderrBuf.String())
		}
		return nil, errors.Wrapf(err, "failed to run: aws %s: stderr: %v",
			strings.Join(args, " "), stderrBuf.String())
	}
	return output, nil
}
// runJSONCommand runs an aws command via runCommand with "--output json"
// appended and unmarshals the resulting JSON into parsed.
func (p *Provider) runJSONCommand(args []string, parsed interface{}) error {
	// Always request json, regardless of any default output format the user
	// may have configured. The capped slice expression makes append copy
	// rather than write into the caller's backing array.
	jsonArgs := append(args[:len(args):len(args)], "--output", "json")

	out, err := p.runCommand(jsonArgs)
	if err != nil {
		return err
	}

	decodeErr := json.Unmarshal(out, &parsed)
	if decodeErr == nil {
		return nil
	}
	return errors.Wrapf(decodeErr, "failed to parse json %s", out)
}
// regionMap groups the given VMs by region, where each VM's region is derived
// from its Zone via zoneToRegion. The first conversion error aborts the
// grouping and is returned with a nil map.
func regionMap(vms vm.List) (map[string]vm.List, error) {
	grouped := map[string]vm.List{}
	for _, machine := range vms {
		key, err := zoneToRegion(machine.Zone)
		if err != nil {
			return nil, err
		}
		bucket := grouped[key]
		grouped[key] = append(bucket, machine)
	}
	return grouped, nil
}
// zoneToRegion converts an availability zone like us-east-2a to the region
// name us-east-2 by dropping the trailing zone letter.
//
// It returns an error for an empty zone, which would otherwise cause an
// out-of-range slice panic.
func zoneToRegion(zone string) (string, error) {
	if zone == "" {
		return "", errors.New("cannot derive region from empty availability zone")
	}
	return zone[:len(zone)-1], nil
}