-
Notifications
You must be signed in to change notification settings - Fork 292
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Defer remote components to avoid OkHttp class-loading side-effects #8131
Changes from all commits
92ca151
40c7583
84d2b6d
9b97d7a
2cdc507
9a09d98
9ee929b
c9cf18f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,6 @@ | |
import static datadog.trace.util.AgentThreadFactory.AgentThread.PROFILER_STARTUP; | ||
import static datadog.trace.util.AgentThreadFactory.AgentThread.TRACE_STARTUP; | ||
import static datadog.trace.util.AgentThreadFactory.newAgentThread; | ||
import static datadog.trace.util.Strings.getResourceName; | ||
import static datadog.trace.util.Strings.propertyNameToSystemPropertyName; | ||
import static datadog.trace.util.Strings.toEnvVar; | ||
|
||
|
@@ -348,7 +347,7 @@ public void run() { | |
* logging facility. Likewise on IBM JDKs OkHttp may indirectly load 'IBMSASL' which in turn loads LogManager. | ||
*/ | ||
InstallDatadogTracerCallback installDatadogTracerCallback = | ||
new InstallDatadogTracerCallback(initTelemetry, inst); | ||
new InstallDatadogTracerCallback(initTelemetry, inst, delayOkHttp); | ||
if (delayOkHttp) { | ||
log.debug("Custom logger detected. Delaying Datadog Tracer initialization."); | ||
registerLogManagerCallback(installDatadogTracerCallback); | ||
|
@@ -497,28 +496,21 @@ public void execute() { | |
} | ||
|
||
protected static class InstallDatadogTracerCallback extends ClassLoadCallBack { | ||
private final InitializationTelemetry initTelemetry; | ||
private final Instrumentation instrumentation; | ||
private final Object sco; | ||
private final Class<?> scoClass; | ||
private final boolean delayOkHttp; | ||
|
||
public InstallDatadogTracerCallback( | ||
InitializationTelemetry initTelemetry, Instrumentation instrumentation) { | ||
this.initTelemetry = initTelemetry; | ||
InitializationTelemetry initTelemetry, | ||
Instrumentation instrumentation, | ||
boolean delayOkHttp) { | ||
this.delayOkHttp = delayOkHttp; | ||
this.instrumentation = instrumentation; | ||
} | ||
|
||
@Override | ||
public AgentThread agentThread() { | ||
return TRACE_STARTUP; | ||
} | ||
|
||
@Override | ||
public void execute() { | ||
Object sco; | ||
Class<?> scoClass; | ||
try { | ||
scoClass = | ||
AGENT_CLASSLOADER.loadClass("datadog.communication.ddagent.SharedCommunicationObjects"); | ||
sco = scoClass.getConstructor().newInstance(); | ||
sco = scoClass.getConstructor(boolean.class).newInstance(delayOkHttp); | ||
} catch (ClassNotFoundException | ||
| NoSuchMethodException | ||
| InstantiationException | ||
|
@@ -528,10 +520,23 @@ public void execute() { | |
} | ||
|
||
installDatadogTracer(initTelemetry, scoClass, sco); | ||
maybeInstallLogsIntake(scoClass, sco); | ||
} | ||
|
||
@Override | ||
public AgentThread agentThread() { | ||
return TRACE_STARTUP; | ||
} | ||
|
||
@Override | ||
public void execute() { | ||
if (delayOkHttp) { | ||
resumeRemoteComponents(); | ||
} | ||
|
||
maybeStartAppSec(scoClass, sco); | ||
maybeStartIast(instrumentation, scoClass, sco); | ||
maybeStartCiVisibility(instrumentation, scoClass, sco); | ||
maybeStartLogsIntake(scoClass, sco); | ||
// start debugger before remote config to subscribe to it before starting to poll | ||
maybeStartDebugger(instrumentation, scoClass, sco); | ||
maybeStartRemoteConfig(scoClass, sco); | ||
|
@@ -540,6 +545,18 @@ public void execute() { | |
startTelemetry(instrumentation, scoClass, sco); | ||
} | ||
} | ||
|
||
private void resumeRemoteComponents() { | ||
try { | ||
// remote components were paused for custom log-manager/jmx-builder | ||
// add small delay before resuming remote I/O to help stabilization | ||
Thread.sleep(1_000); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dumb question: is this an empirical delay? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes - having no delay or a delay of a few 100ms caused issues because the custom log manager might not have finished its setup by the time the delay expires. The resulting JUL class-loading triggered from this thread via OkHttp could then lead to the logging setup not being complete, which can in turn lead to missing logging. There's no good class-loader signal to tell us the custom log manager is ready, only when JUL has been touched (and if we try to be too smart then we might miss the signal, which would lead to us never resuming remote I/O) So this is an empirical practical solution which stabilizes the test locally and on CI. That's also why it's important to install the tracer (and logs-intake collector) early so they can capture data while holding back the remote I/O. That way we avoid triggering JUL issues while not losing trace or log data. Also note this code path is only used for very specific setups, specifically Java 8 with JFR or IBM JDKs. |
||
scoClass.getMethod("resume").invoke(sco); | ||
} catch (InterruptedException ignore) { | ||
} catch (Throwable e) { | ||
log.error("Error resuming remote components", e); | ||
} | ||
} | ||
} | ||
|
||
protected static class StartProfilingAgentCallback extends ClassLoadCallBack { | ||
|
@@ -866,17 +883,18 @@ private static void maybeStartCiVisibility(Instrumentation inst, Class<?> scoCla | |
} | ||
} | ||
|
||
private static void maybeStartLogsIntake(Class<?> scoClass, Object sco) { | ||
private static void maybeInstallLogsIntake(Class<?> scoClass, Object sco) { | ||
if (agentlessLogSubmissionEnabled) { | ||
StaticEventLogger.begin("Logs Intake"); | ||
|
||
try { | ||
final Class<?> logsIntakeSystemClass = | ||
AGENT_CLASSLOADER.loadClass("datadog.trace.logging.intake.LogsIntakeSystem"); | ||
final Method logsIntakeInstallerMethod = logsIntakeSystemClass.getMethod("start", scoClass); | ||
final Method logsIntakeInstallerMethod = | ||
logsIntakeSystemClass.getMethod("install", scoClass); | ||
logsIntakeInstallerMethod.invoke(null, sco); | ||
} catch (final Throwable e) { | ||
log.warn("Not starting Logs Intake subsystem", e); | ||
log.warn("Not installing Logs Intake subsystem", e); | ||
} | ||
|
||
StaticEventLogger.end("Logs Intake"); | ||
|
@@ -1267,14 +1285,8 @@ private static boolean isAppUsingCustomLogManager(final EnumSet<Library> librari | |
|
||
final String logManagerProp = System.getProperty("java.util.logging.manager"); | ||
if (logManagerProp != null) { | ||
final boolean onSysClasspath = | ||
ClassLoader.getSystemResource(getResourceName(logManagerProp)) != null; | ||
log.debug("Prop - logging.manager: {}", logManagerProp); | ||
log.debug("logging.manager on system classpath: {}", onSysClasspath); | ||
// Some applications set java.util.logging.manager but never actually initialize the logger. | ||
// Check to see if the configured manager is on the system classpath. | ||
// If so, it should be safe to initialize jmxfetch which will setup the log manager. | ||
return !onSysClasspath; | ||
return true; | ||
} | ||
|
||
return false; | ||
|
@@ -1305,14 +1317,8 @@ private static boolean isAppUsingCustomJMXBuilder(final EnumSet<Library> librari | |
|
||
final String jmxBuilderProp = System.getProperty("javax.management.builder.initial"); | ||
if (jmxBuilderProp != null) { | ||
final boolean onSysClasspath = | ||
ClassLoader.getSystemResource(getResourceName(jmxBuilderProp)) != null; | ||
log.debug("Prop - javax.management.builder.initial: {}", jmxBuilderProp); | ||
log.debug("javax.management.builder.initial on system classpath: {}", onSysClasspath); | ||
// Some applications set javax.management.builder.initial but never actually initialize JMX. | ||
// Check to see if the configured JMX builder is on the system classpath. | ||
// If so, it should be safe to initialize jmxfetch which will setup JMX. | ||
return !onSysClasspath; | ||
return true; | ||
} | ||
|
||
return false; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the whole module not testable or it the cost too high?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's the double-checked lock that tips things over - attempting to cover all the cases for this piece of code would be very intensive (you'd end up testing the narrow volatile visibility edge.) There are other branches in
SharedCommunicationObjects
which are not currently tested which is why we're at this boundary to begin with, but adding tests for those unrelated pieces of code in this PR is IMHO confusing for future reviewers.I could just exclude
SharedCommunicationObjects
from branch coverage completely, but that feels worse - it's already excluded from instrumentation coverage - so reducing the coverage requirement here by a small amount is the least worst option.The proper solution is to add tests to increase coverage in a separate PR, as a separate work item.