Skip to content

Commit

Permalink
Support tmpfs and a server queue for C++ compilers (#10)
Browse files Browse the repository at this point in the history
This PR drops an option -working-dir, adding two new instead:
* -cpp-dir {string}, Directory for incoming C++ files and src cache
* -obj-dir {string}, Directory for resulting obj files and obj cache
The directory passed as -cpp-dir can be placed in tmpfs.

Also, nocc-server now managers C++ compiler launches with a waiting queue. 
The purpose of a waiting queue is not to over-utilize server resources at peak times.
  • Loading branch information
tolk-vm authored Apr 26, 2023
1 parent edc04d3 commit 73b26e5
Show file tree
Hide file tree
Showing 28 changed files with 806 additions and 482 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
RELEASE = v1.1.2
RELEASE = v1.2
BUILD_COMMIT := $(shell git rev-parse --short HEAD)
DATE := $(shell date -u '+%F %X UTC')
VERSION := ${RELEASE}, rev ${BUILD_COMMIT}, compiled at ${DATE}
Expand Down
9 changes: 6 additions & 3 deletions cmd/nocc-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,17 +91,20 @@ func main() {
}

if *checkServersAndExit {
if len(remoteNoccHosts) == 0 {
failedStart("no remote hosts set; you should set NOCC_SERVERS or NOCC_SERVERS_FILENAME")
}
if len(os.Args) == 3 { // nocc -check-servers {remoteHostPort}
remoteNoccHosts = []string{os.Args[2]}
}
if len(remoteNoccHosts) == 0 {
failedStart("no remote hosts set; you should set NOCC_SERVERS or NOCC_SERVERS_FILENAME")
}
client.RequestRemoteStatus(remoteNoccHosts)
os.Exit(0)
}

if *dumpServerLogsAndExit {
if len(os.Args) == 3 { // nocc -dump-server-logs {remoteHostPort}
remoteNoccHosts = []string{os.Args[2]}
}
if len(remoteNoccHosts) == 0 {
failedStart("no remote hosts set; you should set NOCC_SERVERS or NOCC_SERVERS_FILENAME")
}
Expand Down
55 changes: 30 additions & 25 deletions cmd/nocc-server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"fmt"
"net"
"os"
"path"
"runtime"
"time"

"github.com/VKCOM/nocc/internal/common"
Expand All @@ -18,24 +18,29 @@ func failedStart(message string, err error) {
os.Exit(1)
}

// cleanupWorkingDir ensures that workingDir exists and is empty
// prepareEmptyDir ensures that serverDir exists and is empty
// it's executed on server launch
// as a consequence, all file caches are lost on restart
func cleanupWorkingDir(workingDir string) error {
oldWorkingDir := workingDir + ".old"

if err := os.RemoveAll(oldWorkingDir); err != nil {
failedStart("can't remove old working dir", err)
}
if _, err := os.Stat(workingDir); err == nil {
if err := os.Rename(workingDir, oldWorkingDir); err != nil {
failedStart("can't rename working dir %s to .old", err)
func prepareEmptyDir(parentDir *string, subdir string) string {
// if /tmp/nocc/cpp/src-cache already exists, it means, that it contains files from a previous launch
// to start up as quickly as possible, do the following:
// 1) rename it to /tmp/nocc/cpp/src-cache.old
// 2) clear it recursively in the background
serverDir := *parentDir + "/" + subdir
if _, err := os.Stat(serverDir); err == nil {
oldDirRenamed := fmt.Sprintf("%s.old.%d", serverDir, time.Now().Unix())
if err := os.Rename(serverDir, oldDirRenamed); err != nil {
failedStart("can't rename "+serverDir, err)
}
go func() {
_ = os.RemoveAll(oldDirRenamed)
}()
}
if err := os.MkdirAll(workingDir, os.ModePerm); err != nil {
return err

if err := os.MkdirAll(serverDir, os.ModePerm); err != nil {
failedStart("can't create "+serverDir, err)
}
return nil
return serverDir
}

// printDockerContainerIP is a dev/debug function called only when build special for local Docker, for local testing.
Expand All @@ -58,8 +63,10 @@ func main() {
"host", "")
listenPort := common.CmdEnvInt("Listening port, default 43210.", 43210,
"port", "")
workingDir := common.CmdEnvString("Directory for saving incoming files, default /tmp/nocc-server.", "/tmp/nocc-server",
"working-dir", "")
cppStoreDir := common.CmdEnvString("Directory for incoming C++ files and src cache, default /tmp/nocc/cpp.\nIt can be placed in tmpfs to speed up compilation", "/tmp/nocc/cpp",
"cpp-dir", "")
objStoreDir := common.CmdEnvString("Directory for resulting obj files and obj cache, default /tmp/nocc/obj.", "/tmp/nocc/obj",
"obj-dir", "")
logFileName := common.CmdEnvString("A filename to log, by default use stderr.", "",
"log-filename", "")
logVerbosity := common.CmdEnvInt("Logger verbosity level for INFO (-1 off, default 0, max 2).\nErrors are logged always.", 0,
Expand All @@ -70,6 +77,8 @@ func main() {
"obj-cache-limit", "")
statsdHostPort := common.CmdEnvString("Statsd udp address (host:port), omitted by default.\nIf omitted, stats won't be written.", "",
"statsd", "")
maxParallelCxx := common.CmdEnvInt("Max amount of C++ compiler processes launched in parallel, other ready sessions are waiting in a queue.\nBy default, it's a number of CPUs on the current machine.", int64(runtime.NumCPU()),
"max-parallel-cxx", "")

common.ParseCmdFlagsCombiningWithEnv()

Expand All @@ -78,10 +87,6 @@ func main() {
os.Exit(0)
}

if err = cleanupWorkingDir(*workingDir); err != nil {
failedStart("Can't create working directory "+*workingDir, err)
}

if err = server.MakeLoggerServer(*logFileName, *logVerbosity); err != nil {
failedStart("Can't init logger", err)
}
Expand All @@ -95,12 +100,12 @@ func main() {
failedStart("Failed to connect to statsd", err)
}

s.ActiveClients, err = server.MakeClientsStorage(path.Join(*workingDir, "clients"))
s.ActiveClients, err = server.MakeClientsStorage(prepareEmptyDir(cppStoreDir, "clients"))
if err != nil {
failedStart("Failed to init clients hashtable", err)
}

s.CxxLauncher, err = server.MakeCxxLauncher()
s.CxxLauncher, err = server.MakeCxxLauncher(*maxParallelCxx)
if err != nil {
failedStart("Failed to init cxx launcher", err)
}
Expand All @@ -110,17 +115,17 @@ func main() {
failedStart("Failed to init system headers hashtable", err)
}

s.SrcFileCache, err = server.MakeSrcFileCache(path.Join(*workingDir, "src-cache"), *srcCacheLimit)
s.SrcFileCache, err = server.MakeSrcFileCache(prepareEmptyDir(cppStoreDir, "src-cache"), *srcCacheLimit)
if err != nil {
failedStart("Failed to init src file cache", err)
}

s.ObjFileCache, err = server.MakeObjFileCache(path.Join(*workingDir, "obj-cache"), *objCacheLimit)
s.ObjFileCache, err = server.MakeObjFileCache(prepareEmptyDir(objStoreDir, "obj-cache"), prepareEmptyDir(objStoreDir, "cxx-out"), *objCacheLimit)
if err != nil {
failedStart("Failed to init obj file cache", err)
}

s.PchCompilation, err = server.MakePchCompilation(path.Join(*workingDir, "pch"))
s.PchCompilation, err = server.MakePchCompilation(prepareEmptyDir(cppStoreDir, "pch"))
if err != nil {
failedStart("Failed to init pch compilation", err)
}
Expand Down
16 changes: 1 addition & 15 deletions cmd/nocc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ char *format_time_to_log() {
static char time_buf[64];
time_t ts = time(nullptr);
tm *now = localtime(&ts);
sprintf(time_buf, "%d/%02d/%02d %02d:%02d:%02d", 1900 + now->tm_year, 1 + now->tm_mon, now->tm_mday, now->tm_hour, now->tm_min, now->tm_sec);
sprintf(time_buf, "%d-%02d-%02d %02d:%02d:%02d", 1900 + now->tm_year, 1 + now->tm_mon, now->tm_mday, now->tm_hour, now->tm_min, now->tm_sec);
return time_buf;
}

Expand Down Expand Up @@ -97,13 +97,6 @@ void __attribute__((noreturn)) execute_cxx_locally(const char *errToPrint, int e
exit(1);
}

void __attribute__((noreturn)) execute_distcc_locally() {
ARGV[0] = strdup("distcc");
execvp("distcc", ARGV + 0);
printf("could not run `distcc`, exit(1)\n");
exit(1);
}

void __attribute__((noreturn)) execute_go_nocc_instead_of_cpp() {
execv(NOCC_GO_EXECUTABLE, ARGV);
printf("could not run %s, exit(1)\n", NOCC_GO_EXECUTABLE);
Expand Down Expand Up @@ -279,13 +272,6 @@ int main(int argc, char *argv[]) {
exit(1);
}

// this possible fallback will be available for some time just in case
char *env_fallback_to_distcc = getenv("NOCC_FALLBACK_TO_DISTCC");
bool fallback_to_distcc = env_fallback_to_distcc != nullptr && env_fallback_to_distcc[0] == '1';
if (fallback_to_distcc) {
execute_distcc_locally();
}

if (ARGC == 2 && !strcmp(ARGV[1], "start")) {
int sockfd = connect_to_go_daemon_or_start_a_new_one();
exit(sockfd == -1 ? 1 : 0);
Expand Down
42 changes: 32 additions & 10 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,18 @@ When you launch lots of jobs like `make -j 600`, then `nocc-daemon` has to maint
All configuration on a server-side is done using command-line arguments.
For a server, they are more reliable than environment variables.

| Cmd argument | Description |
|--------------------------|-----------------------------------------------------------------------------------------|
| `-host {string}` | Binding address, default 0.0.0.0. |
| `-port {int}` | Listening port, default 43210. |
| `-working-dir {string}` | Directory for saving incoming files, default */tmp/nocc-server*. |
| `-log-filename {string}` | A filename to log, by default use stderr. |
| `-log-verbosity {int}` | Logger verbosity level for INFO (-1 off, default 0, max 2). Errors are logged always. |
| `-src-cache-limit {int}` | Header and source cache limit, in bytes, default 4G. |
| `-obj-cache-limit {int}` | Compiled obj cache limit, in bytes, default 16G. |
| `-statsd {string}` | Statsd udp address (host:port), omitted by default. If omitted, stats won't be written. |
| Cmd argument | Description |
|---------------------------|-----------------------------------------------------------------------------------------|
| `-host {string}` | Binding address, default 0.0.0.0. |
| `-port {int}` | Listening port, default 43210. |
| `-cpp-dir {string}` | Directory for incoming C++ files and src cache, default */tmp/nocc/cpp*. |
| `-obj-dir {string}` | Directory for resulting obj files and obj cache, default */tmp/nocc/obj*. |
| `-log-filename {string}` | A filename to log, by default use stderr. |
| `-log-verbosity {int}` | Logger verbosity level for INFO (-1 off, default 0, max 2). Errors are logged always. |
| `-src-cache-limit {int}` | Header and source cache limit, in bytes, default 4G. |
| `-obj-cache-limit {int}` | Compiled obj cache limit, in bytes, default 16G. |
| `-statsd {string}` | Statsd udp address (host:port), omitted by default. If omitted, stats won't be written. |
| `-max-parallel-cxx {int}` | Max amount of C++ compiler processes launched in parallel, default *nCPU*. |

All file caches are lost on restart, as references to files are kept in memory.
There is also an LRU expiration mechanism to fit cache limits.
Expand Down Expand Up @@ -75,6 +77,26 @@ A list of all written stats could be obtained [inside statsd.go](../internal/ser
They are quite intuitive, that's why we don't duplicate them here.


<p><br></p>

## Configuring nocc + tmpfs

The directory passed as `-cpp-dir` can be placed in **tmpfs**.
All operations with cpp files are performed in that directory:
* incoming files (h/cpp/etc.) are saved there mirroring client's file structure;
* src-cache is placed there;
* pch files are placed there;
* tmp files for preventing race conditions are also there, not in sys tmp dir.

So, if that directory is placed in tmpfs, the C++ compiler will take all files from memory (except for system headers),
which noticeably speeds up compilation.

When setting up limits to tmpfs in a system, ensure that it will fit `-src-cache-limit` plus some extra space.

Note, that placing `-obj-dir` in tmpfs is not recommended, because obj files are usually much heavier,
and they are just transparently streamed back from a hard disk in chunks.


<p><br></p>

## Other commands from a client
Expand Down
4 changes: 2 additions & 2 deletions internal/client/compile-remotely.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func CompileCppRemotely(daemon *Daemon, cwd string, invocation *Invocation, remo
return 0, nil, nil, err
}

logClient.Info(1, "remote", remote.remoteHostPort, "sessionID", invocation.sessionID, "waiting", len(fileIndexesToUpload), "uploads", invocation.cppInFile)
logClient.Info(1, "remote", remote.remoteHost, "sessionID", invocation.sessionID, "waiting", len(fileIndexesToUpload), "uploads", invocation.cppInFile)
logClient.Info(2, "checked", len(requiredFiles), "files whether upload is needed or they exist on remote")
invocation.summary.AddTiming("remote_session")

Expand All @@ -75,7 +75,7 @@ func CompileCppRemotely(daemon *Daemon, cwd string, invocation *Invocation, remo

// Now, we have a resulting .o file placed in a path determined by -o from command line.
if exitCode != 0 {
logClient.Info(0, "remote C++ compiler exited with code", exitCode, "sessionID", invocation.sessionID, invocation.cppInFile, remote.remoteHostPort)
logClient.Info(0, "remote C++ compiler exited with code", exitCode, "sessionID", invocation.sessionID, invocation.cppInFile, remote.remoteHost)
logClient.Info(1, "cxxExitCode:", exitCode, "sessionID", invocation.sessionID, "\ncxxStdout:", strings.TrimSpace(string(invocation.cxxStdout)), "\ncxxStderr:", strings.TrimSpace(string(invocation.cxxStderr)))
} else {
logClient.Info(2, "saved obj file to", invocation.objOutFile)
Expand Down
55 changes: 40 additions & 15 deletions internal/client/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
)

const (
timeoutForceInterruptInvocation = 5 * time.Minute
timeoutForceInterruptInvocation = 8 * time.Minute
)

// Daemon is created once, in a separate process `nocc-daemon`, which is listening for connections via unix socket.
Expand All @@ -37,6 +37,7 @@ type Daemon struct {

listener *DaemonUnixSockListener
remoteConnections []*RemoteConnection
allRemotesDelim string
localCxxThrottle chan struct{}

disableObjCache bool
Expand Down Expand Up @@ -77,31 +78,55 @@ func detectHostUserName() string {
return curUser.Username
}

func MakeDaemon(remoteNoccHosts []string, disableObjCache bool, disableOwnIncludes bool, localCxxQueueSize int64) (*Daemon, error) {
func MakeDaemon(remoteNoccHosts []string, disableObjCache bool, disableOwnIncludes bool, maxLocalCxxProcesses int64) (*Daemon, error) {
// send env NOCC_SERVERS on connect everywhere
// this is for debugging purpose: in production, all clients should have the same servers list
// to ensure this, just grep server logs: only one unique string should appear
allRemotesDelim := ""
for _, remoteHostPort := range remoteNoccHosts {
if allRemotesDelim != "" {
allRemotesDelim += ","
}
allRemotesDelim += ExtractRemoteHostWithoutPort(remoteHostPort)
}

// env NOCC_SERVERS and others are supposed to be the same between `nocc` invocations
// (in practice, this is true, as the first `nocc` invocation has no precedence over any other in a bunch)
daemon := &Daemon{
startTime: time.Now(),
quitChan: make(chan int),
clientID: detectClientID(),
hostUserName: detectHostUserName(),
remoteConnections: make([]*RemoteConnection, 0, len(remoteNoccHosts)),
localCxxThrottle: make(chan struct{}, localCxxQueueSize),
remoteConnections: make([]*RemoteConnection, len(remoteNoccHosts)),
allRemotesDelim: allRemotesDelim,
localCxxThrottle: make(chan struct{}, maxLocalCxxProcesses),
disableOwnIncludes: disableOwnIncludes,
disableObjCache: disableObjCache,
disableLocalCxx: localCxxQueueSize == 0,
disableLocalCxx: maxLocalCxxProcesses == 0,
activeInvocations: make(map[uint32]*Invocation, 300),
includesCache: make(map[string]*IncludesCache, 1),
}

for _, remoteHostPort := range remoteNoccHosts {
remote, err := MakeRemoteConnection(daemon, remoteHostPort, 1, 1)
if err != nil {
remote.isUnavailable = true
logClient.Error("error connecting to", remoteHostPort, err)
}
daemon.remoteConnections = append(daemon.remoteConnections, remote)
// connect to all remotes in parallel
wg := sync.WaitGroup{}
wg.Add(len(remoteNoccHosts))

ctxConnect, cancelFunc := context.WithTimeout(context.Background(), 5000*time.Millisecond)
defer cancelFunc()

for index, remoteHostPort := range remoteNoccHosts {
go func(index int, remoteHostPort string) {
remote, err := MakeRemoteConnection(daemon, remoteHostPort, ctxConnect)
if err != nil {
remote.isUnavailable = true
logClient.Error("error connecting to", remoteHostPort, err)
}

daemon.remoteConnections[index] = remote
wg.Done()
}(index, remoteHostPort)
}
wg.Wait()

return daemon, nil
}
Expand Down Expand Up @@ -202,10 +227,10 @@ func (daemon *Daemon) HandleInvocation(req DaemonSockRequest) DaemonSockResponse
}

remote := daemon.chooseRemoteConnectionForCppCompilation(invocation.cppInFile)
invocation.summary.remoteHostPort = remote.remoteHostPort
invocation.summary.remoteHost = remote.remoteHost

if remote.isUnavailable {
return daemon.FallbackToLocalCxx(req, fmt.Errorf("remote %s is unavailable", remote.remoteHostPort))
return daemon.FallbackToLocalCxx(req, fmt.Errorf("remote %s is unavailable", remote.remoteHost))
}

daemon.mu.Lock()
Expand Down Expand Up @@ -292,7 +317,7 @@ func (daemon *Daemon) PeriodicallyInterruptHangedInvocations() {
daemon.mu.Lock()
for _, invocation := range daemon.activeInvocations {
if time.Since(invocation.createTime) > timeoutForceInterruptInvocation {
invocation.ForceInterrupt(fmt.Errorf("interrupt sessionID %d after %d sec timeout", invocation.sessionID, int(time.Since(invocation.createTime).Seconds())))
invocation.ForceInterrupt(fmt.Errorf("interrupt sessionID %d (%s) after %d sec timeout", invocation.sessionID, invocation.summary.remoteHost, int(time.Since(invocation.createTime).Seconds())))
}
}
daemon.mu.Unlock()
Expand Down
2 changes: 1 addition & 1 deletion internal/client/files-receiving.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func receiveObjFileByChunks(stream pb.CompilationService_RecvCompiledObjStreamCl
return errWrite, false
}

fileTmp, errWrite := common.OpenTempFile(objOutFile, false)
fileTmp, errWrite := common.OpenTempFile(objOutFile)
if errWrite == nil {
_, errWrite = fileTmp.Write(firstChunk.ChunkBody)
}
Expand Down
4 changes: 2 additions & 2 deletions internal/client/invocation-summary.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ type invocationTimingItem struct {
// It's mostly for developing/debugging purposes: multiple nocc invocations are appended to a single log file,
// from which we can compute statistics, average and percentiles, either in total or partitioned by hosts.
type InvocationSummary struct {
remoteHostPort string
remoteHost string

nIncludes int
nFilesSent int
Expand All @@ -44,7 +44,7 @@ func (s *InvocationSummary) ToLogString(invocation *Invocation) string {

b := strings.Builder{}
fmt.Fprintf(&b, "cppInFile=%q, remote=%s, sessionID=%d, nIncludes=%d, nFilesSent=%d, nBytesSent=%d, nBytesReceived=%d, cxxDuration=%dms",
invocation.cppInFile, s.remoteHostPort, invocation.sessionID, s.nIncludes, s.nFilesSent, s.nBytesSent, s.nBytesReceived, invocation.cxxDuration)
invocation.cppInFile, s.remoteHost, invocation.sessionID, s.nIncludes, s.nFilesSent, s.nBytesSent, s.nBytesReceived, invocation.cxxDuration)

prevTime := invocation.createTime
fmt.Fprintf(&b, ", started=0ms")
Expand Down
Loading

0 comments on commit 73b26e5

Please sign in to comment.