Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROX-26763: Enhance the handling of failures in procfsscraper #1987

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion collector/lib/CollectorConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,8 @@ std::ostream& operator<<(std::ostream& os, const CollectorConfig& c) {
<< ", collect_connection_status:" << c.CollectConnectionStatus()
<< ", enable_detailed_metrics:" << c.EnableDetailedMetrics()
<< ", enable_external_ips:" << c.EnableExternalIPs()
<< ", track_send_recv:" << c.TrackingSendRecv();
<< ", track_send_recv:" << c.TrackingSendRecv()
<< ", enable_introspection:" << c.IsIntrospectionEnabled();
}

// Returns size of ring buffers to be allocated.
Expand Down
1 change: 1 addition & 0 deletions collector/lib/CollectorStats.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
X(procfs_could_not_get_socket_inodes) \
X(procfs_could_not_read_exe) \
X(procfs_could_not_read_cmdline) \
X(procfs_zombie_process) \
X(event_timestamp_distant_past) \
X(event_timestamp_future)

Expand Down
51 changes: 48 additions & 3 deletions collector/lib/ProcfsScraper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <cctype>
#include <cinttypes>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <string_view>
Expand Down Expand Up @@ -143,6 +144,23 @@ bool GetSocketINodes(int dirfd, uint64_t pid, UnorderedSet<SocketInfo>* sock_ino
return true;
}

// Fetches the current state of the process pointed to by dirfd
// returns nulopt in case of error
std::optional<char> ReadProcessState(int dirfd) {
FileHandle stat_file(FDHandle(openat(dirfd, "stat", O_RDONLY)), "r");
if (!stat_file.valid()) {
return false;
}

char linebuf[512];

if (fgets(linebuf, sizeof(linebuf), stat_file.get()) == nullptr) {
return false;
}

return ExtractProcessState(linebuf);
}

// GetContainerID retrieves the container ID of the process represented by dirfd. The container ID is extracted from
// the cgroup.
std::optional<std::string> GetContainerID(int dirfd) {
Expand Down Expand Up @@ -491,16 +509,24 @@ bool ReadContainerConnections(const char* proc_path, std::shared_ptr<ProcessStor
continue;
}

auto process_state = ReadProcessState(dirfd);
if (process_state && *process_state == 'Z') {
COUNTER_INC(CollectorStats::procfs_zombie_process);
continue;
}

auto container_id = GetContainerID(dirfd);
if (!container_id) {
continue;
}

uint64_t netns_inode;
if (!GetNetworkNamespace(dirfd, &netns_inode)) {
// TODO ROX-13962: Improve logging to indicate when a process is defunct.
COUNTER_INC(CollectorStats::procfs_could_not_get_network_namespace);
CLOG_THROTTLED(ERROR, std::chrono::seconds(10)) << "Could not determine network namespace: " << StrError();
CLOG(TRACE) << "Could not determine network namespace: " << StrError();
if (process_state) {
CLOG(TRACE) << "Process state: " << *process_state;
}
continue;
}

Expand All @@ -509,7 +535,10 @@ bool ReadContainerConnections(const char* proc_path, std::shared_ptr<ProcessStor

if (!GetSocketINodes(dirfd, pid, &container_ns_sockets)) {
COUNTER_INC(CollectorStats::procfs_could_not_get_socket_inodes);
CLOG_THROTTLED(ERROR, std::chrono::seconds(10)) << "Could not obtain socket inodes: " << StrError();
CLOG(TRACE) << "Could not obtain socket inodes: " << StrError();
if (process_state) {
CLOG(TRACE) << "Process state: " << *process_state;
}
continue;
}

Expand Down Expand Up @@ -609,6 +638,22 @@ std::optional<std::string_view> ExtractContainerID(std::string_view cgroup_line)
return ExtractContainerIDFromCgroup(cgroup_path);
}

std::optional<char> ExtractProcessState(std::string_view line) {
size_t last_parenthese;

if ((last_parenthese = line.rfind(") ")) == line.npos) {
return {};
}

line.remove_prefix(last_parenthese + 2);

if (line.empty()) {
return {};
}

return line[0];
}

bool ConnScraper::Scrape(std::vector<Connection>* connections, std::vector<ContainerEndpoint>* listen_endpoints) {
return ReadContainerConnections(proc_path_.c_str(), process_store_, connections, listen_endpoints);
}
Expand Down
5 changes: 5 additions & 0 deletions collector/lib/ProcfsScraper_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ namespace collector {
// ExtractContainerID tries to extract a container ID from a cgroup line.
std::optional<std::string_view> ExtractContainerID(std::string_view cgroup_line);

// ExtractProcessState retrieves the state of the process (3rd element).
// as found in /proc/<pid>/stat.
// Returns: the state character or nullopt in case of error.
std::optional<char> ExtractProcessState(std::string_view proc_pid_stat_line);

} // namespace collector

#endif
21 changes: 21 additions & 0 deletions collector/test/ConnScraperTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,27 @@ TEST(ConnScraperTest, TestExtractContainerID) {
}
}

TEST(ConnScraperTest, TestProcStateExtract) {
std::optional<char> state;

// valid line
state = ExtractProcessState("13934 (prog) Z 2312 13934 2312 34819 13934 4194304 94 0 0 0 0 0 0 0 20 0 1 0 608787 5758976 409 18446744073709551615 94201870180352 94201870200233 140728860702192 0 0 0 0 0 0 0 0 0 17 4 0 0 0 0 0 94201870216240 94201870217856 94202687545344 140728860710184 140728860710204 140728860710204 140728860712939 0\n");

EXPECT_TRUE(state);
EXPECT_EQ(*state, 'Z');

// invalid
state = ExtractProcessState("13934 (prog) ");

EXPECT_FALSE(state);

// program name containing ')'
state = ExtractProcessState("13934 (prog ) Z) R 2312 13934 2312 34819 13934 4194304 94 0 0 0 0 0 0 0 20 0 1 0 608787 5758976 409 18446744073709551615 94201870180352 94201870200233 140728860702192 0 0 0 0 0 0 0 0 0 17 4 0 0 0 0 0 94201870216240 94201870217856 94202687545344 140728860710184 140728860710204 140728860710204 140728860712939 0\n");

EXPECT_TRUE(state);
EXPECT_EQ(*state, 'R');
}

} // namespace

} // namespace collector
Loading