From 80a18076bbe1028cc5912a491bd05ef69a1dcbae Mon Sep 17 00:00:00 2001 From: Matheus Svolenski Date: Fri, 13 Oct 2023 15:21:10 +0200 Subject: [PATCH] New Relic Integration --- README.md | 44 ++++++++++++++++ buildpack/stage.py | 2 +- buildpack/telemetry/fluentbit.py | 81 +++++++++++++++++++++++------- buildpack/telemetry/metrics.py | 3 +- buildpack/telemetry/newrelic.py | 47 ++++++++++++++++- buildpack/telemetry/splunk.py | 24 +++------ buildpack/telemetry/telegraf.py | 16 +++++- etc/fluentbit/fluentbit.conf | 19 +++---- etc/fluentbit/metadata.lua | 9 ++-- etc/fluentbit/output_newrelic.conf | 5 ++ etc/fluentbit/output_splunk.conf | 9 ++++ etc/telegraf/telegraf.toml.j2 | 20 ++++++-- requirements.txt | 2 +- tests/integration/test_newrelic.py | 51 +++++++++++++++++++ 14 files changed, 272 insertions(+), 60 deletions(-) create mode 100644 etc/fluentbit/output_newrelic.conf create mode 100644 etc/fluentbit/output_splunk.conf create mode 100644 tests/integration/test_newrelic.py diff --git a/README.md b/README.md index 1bae52bb..53fba325 100644 --- a/README.md +++ b/README.md @@ -598,6 +598,50 @@ The buildpack includes a variety of telemetry agents that can be configured to c ### New Relic +#### Set up New Relic integration + +[Fluent Bit](https://docs.fluentbit.io/manual/) is used to collect Mendix Runtime logs to [New Relic](https://newrelic.com/). + +The metrics are collected by the New Relic Java Agent and an integration with Telegraf. The first collects some container and database metrics, while the second collects metrics related to the Mendix Runtime. 
+ +To enable the integration you must provide the following variables: + +| Environment variable | Value example | Default | Description | +|-------------------------|------------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------| +| `NEW_RELIC_LICENSE_KEY` | `api_key` | - | License Key or API Key ([docs](https://docs.newrelic.com/docs/apis/intro-apis/new-relic-api-keys/)) | +| `NEW_RELIC_METRICS_API` | `https://metric-api.eu.newrelic.com/metric/v1` | - | Metrics endpoint API ([docs](https://docs.newrelic.com/docs/data-apis/ingest-apis/metric-api/report-metrics-metric-api/#api-endpoint)) | +| `NEW_RELIC_LOGS_API` | `https://log-api.eu.newrelic.com/log/v1` | - | Logs endpoint API ([docs](https://docs.newrelic.com/docs/logs/log-api/introduction-log-api/)) | + +:warning: For the first usage of the New Relic integration, the Mendix app should be redeployed after setting the variables up. + +Custom tags +You can also set up custom tags in the following format key:value. We recommend that you add the following custom tags: + +app:{app_name} – this enables you to identify all logs sent from your app (for example, app:customermanagement) +env:{environment_name} – this enables you to identify logs sent from a particular environment so you can separate out production logs from test logs (for example, env:accp) + +#### Metadata (IN PROGRESS) + +In addition to the runtime application logs, the following JSON-formatted metadata is automatically sent to New Relic: + +* `environment_id` - unique identifier of the environment; +* `instance_index` - number of the application instance; +* `hostname` - name of the application host; +* `application_name` - default application name, retrieved from domain name; +* `model_version` - model version of the Mendix runtime; +* `runtime_version` - version of the Mendix runtime. 
+ +You can filter the data by these fields on the New Relic web interface. + +#### Custom tags (IN PROGRESS) + +You can also set up custom tags in the following format `key:value`. We recommend that you add the following custom tags: + +* `app:{app_name}` – this enables you to identify all logs sent from your app (for example, **app:customermanagement**) +* `env:{environment_name}` – this enables you to identify logs sent from a particular environment so you can separate out production logs from test logs (for example, **env:accp**) + +#### Service-based integration (on-prem only) + To enable New Relic, simply bind a New Relic service to this app and settings will be picked up automatically. Afterwards you have to restage your application to enable the New Relic agent. ### Splunk diff --git a/buildpack/stage.py b/buildpack/stage.py index b46ab6ec..e29b0d5f 100755 --- a/buildpack/stage.py +++ b/buildpack/stage.py @@ -200,8 +200,8 @@ def cleanup_dependency_cache(cached_dir, dependency_list): appdynamics.stage(BUILDPACK_DIR, DOT_LOCAL_LOCATION, CACHE_DIR) dynatrace.stage(BUILDPACK_DIR, DOT_LOCAL_LOCATION, CACHE_DIR) splunk.stage() - fluentbit.stage(BUILDPACK_DIR, DOT_LOCAL_LOCATION, CACHE_DIR) newrelic.stage(BUILDPACK_DIR, DOT_LOCAL_LOCATION, CACHE_DIR) + fluentbit.stage(BUILDPACK_DIR, DOT_LOCAL_LOCATION, CACHE_DIR) mx_java_agent.stage(BUILDPACK_DIR, DOT_LOCAL_LOCATION, CACHE_DIR, runtime_version) telegraf.stage(BUILDPACK_DIR, DOT_LOCAL_LOCATION, CACHE_DIR, runtime_version) datadog.stage(BUILDPACK_DIR, DOT_LOCAL_LOCATION, CACHE_DIR) diff --git a/buildpack/telemetry/fluentbit.py b/buildpack/telemetry/fluentbit.py index c6a445d4..03ed3a61 100644 --- a/buildpack/telemetry/fluentbit.py +++ b/buildpack/telemetry/fluentbit.py @@ -3,11 +3,12 @@ import subprocess import shutil import socket +from typing import List import backoff from buildpack import util -from buildpack.telemetry import splunk +from buildpack.telemetry import newrelic, splunk NAMESPACE = "fluentbit" 
@@ -15,6 +16,7 @@ FILTER_FILENAMES = ("redaction.lua", "metadata.lua") FLUENTBIT_ENV_VARS = { "FLUENTBIT_LOGS_PORT": os.getenv("FLUENTBIT_LOGS_PORT", default="5170"), + "FLUENTBIT_DEBUG": os.getenv("FLUENTBIT_DEBUG", default="false"), } @@ -23,6 +25,19 @@ def _set_default_env(m2ee): util.upsert_custom_environment_variable(m2ee, var_name, value) +def _get_output_conf_filenames() -> List[str]: + """ + Determine the output configs to use. Only enabled integrations + will have the output file in the container. + """ + output_conf_files: List[str] = [] + if splunk.is_splunk_enabled(): + output_conf_files.append("output_splunk.conf") + if newrelic.is_enabled(): + output_conf_files.append("output_newrelic.conf") + return output_conf_files + + def stage(buildpack_dir, destination_path, cache_path): if not is_fluentbit_enabled(): @@ -36,7 +51,11 @@ def stage(buildpack_dir, destination_path, cache_path): cache_dir=cache_path, ) - for filename in (CONF_FILENAME, *FILTER_FILENAMES): + output_conf_files = _get_output_conf_filenames() + + for filename in ( + CONF_FILENAME, *FILTER_FILENAMES, *output_conf_files + ): shutil.copy( os.path.join(buildpack_dir, "etc", NAMESPACE, filename), os.path.join( @@ -82,10 +101,9 @@ def run(model_version, runtime_version): "fluent-bit", ) - fluentbit_config_path = os.path.join( - fluentbit_dir, - CONF_FILENAME, - ) + fluentbit_config_path = os.path.join(fluentbit_dir, CONF_FILENAME) + + fluentbit_log_file = _get_log_file() if not os.path.exists(fluentbit_bin_path): logging.warning( @@ -93,7 +111,8 @@ def run(model_version, runtime_version): "Please redeploy your application to complete " "Fluent Bit installation." 
) - splunk.print_failed_message() + splunk.integration_setup(False) + newrelic.integration_setup(False) return agent_environment = _set_up_environment(model_version, runtime_version) @@ -101,7 +120,14 @@ def run(model_version, runtime_version): logging.info("Starting Fluent Bit...") subprocess.Popen( - (fluentbit_bin_path, "-c", fluentbit_config_path), env=agent_environment + ( + fluentbit_bin_path, + "-c", + fluentbit_config_path, + "-l", + fluentbit_log_file + ), + env=agent_environment, ) # The runtime does not handle a non-open logs endpoint socket @@ -113,24 +139,36 @@ def _await_logging_endpoint(): ) logging.info("Awaiting Fluent Bit log subscriber...") - if _await_logging_endpoint() == 0: + success = True + if _await_logging_endpoint() != 0: + success = False + + _integration_setup(success) + splunk.integration_setup(success) + newrelic.integration_setup(success) + + +def _integration_setup(success: bool) -> None: + """Call when the setup is done.""" + if success: logging.info("Fluent Bit log subscriber is ready.") - splunk.print_ready_message() else: logging.error( "Fluent Bit log subscriber was not initialized correctly." "Application logs will not be shipped to Fluent Bit." 
) - splunk.print_failed_message() def _set_up_environment(model_version, runtime_version): env_vars = dict(os.environ.copy()) - env_vars["SPLUNK_APP_HOSTNAME"] = util.get_hostname() - env_vars["SPLUNK_APP_NAME"] = util.get_app_from_domain() - env_vars["SPLUNK_APP_RUNTIME_VERSION"] = str(runtime_version) - env_vars["SPLUNK_APP_MODEL_VERSION"] = model_version + # TODO: Improve this - it should be on the envvars already + env_vars["FLUENTBIT_LOGS_PORT"] = "5170" + + env_vars["FLUENTBIT_APP_HOSTNAME"] = util.get_hostname() + env_vars["FLUENTBIT_APP_NAME"] = util.get_app_from_domain() + env_vars["FLUENTBIT_APP_RUNTIME_VERSION"] = str(runtime_version) + env_vars["FLUENTBIT_APP_MODEL_VERSION"] = model_version return env_vars @@ -139,9 +177,18 @@ def is_fluentbit_enabled(): """ The function checks if some modules which requires Fluent Bit is configured. - """ return any( - [splunk.is_splunk_enabled()] + [splunk.is_splunk_enabled(), newrelic.is_enabled()] ) # Add other modules, where Fluent Bit is used + + +def _get_log_file() -> str: + """Discard logs unless debug is active.""" + # FluentBit currently does not support log rotation, + # so the file must only be used when debugging + fluentbit_debug = os.getenv("FLUENTBIT_DEBUG", "false").lower() + if fluentbit_debug == "true": + return "/app/log/fluentbit.log" + return "/dev/null" diff --git a/buildpack/telemetry/metrics.py b/buildpack/telemetry/metrics.py index 9828a1a9..d8bc8268 100644 --- a/buildpack/telemetry/metrics.py +++ b/buildpack/telemetry/metrics.py @@ -18,7 +18,7 @@ from lib.m2ee.version import MXVersion from lib.m2ee.util import strtobool -from . import datadog, appdynamics, dynatrace +from . 
import appdynamics, datadog, dynatrace, newrelic METRICS_REGISTRIES_KEY = "Metrics.Registries" @@ -136,6 +136,7 @@ def configure_metrics_registry(m2ee): or get_appmetrics_target() or appdynamics.machine_agent_enabled() or dynatrace.is_telegraf_enabled() + or newrelic.is_enabled() ): allow_list, deny_list = get_apm_filters() paidapps_registries.append(get_statsd_registry(allow_list, deny_list)) diff --git a/buildpack/telemetry/newrelic.py b/buildpack/telemetry/newrelic.py index c7a95d05..f01f04ba 100644 --- a/buildpack/telemetry/newrelic.py +++ b/buildpack/telemetry/newrelic.py @@ -1,11 +1,16 @@ import logging import os +from typing import Dict from buildpack import util NAMESPACE = "newrelic" ROOT_DIR = ".local" +REQUIRED_NEW_RELIC_ENV_VARS = [ + "NEW_RELIC_LICENSE_KEY", "NEW_RELIC_LOGS_API", "NEW_RELIC_METRICS_API" +] + def stage(buildpack_dir, install_path, cache_path): if get_new_relic_license_key(): @@ -44,7 +49,47 @@ def update_config(m2ee, app_name): def get_new_relic_license_key(): + """Get the New Relic's license key.""" + # Service-binding based integration (on-prem only) vcap_services = util.get_vcap_services_data() if vcap_services and "newrelic" in vcap_services: return vcap_services["newrelic"][0]["credentials"]["licenseKey"] - return None + + return os.environ.get("NEW_RELIC_LICENSE_KEY", None) + + +def is_enabled() -> bool: + """ + The function checks if all environment variables required + for New Relic connection are set up. The service-binding + based integration (on-prem only) does not care about this. 
+ """ + return all(map(os.getenv, REQUIRED_NEW_RELIC_ENV_VARS)) + + +def get_metrics_config() -> Dict: + """Configs to be used by telegraf.""" + return { + "api_key": os.getenv("NEW_RELIC_LICENSE_KEY"), + "metrics_base_url": os.getenv("NEW_RELIC_METRICS_API"), + } + + +def get_metrics_tags(app_name) -> Dict: + """Tags to be used by telegraf.""" + return { + "app": util.get_app_from_domain(), + "instance_index": int(os.getenv("CF_INSTANCE_INDEX", "0")), + "app_name": app_name + } + + +def integration_setup(success: bool) -> None: + """Call when the setup is done.""" + if not is_enabled(): + return + + if success: + logging.info("New Relic has been configured successfully.") + else: + logging.error("Failed to configure New Relic.") diff --git a/buildpack/telemetry/splunk.py b/buildpack/telemetry/splunk.py index b6774fc8..4a501025 100644 --- a/buildpack/telemetry/splunk.py +++ b/buildpack/telemetry/splunk.py @@ -31,30 +31,18 @@ def update_config(m2ee): _set_default_env(m2ee) -def print_ready_message(): +def integration_setup(success: bool) -> None: """ This function can be called from external module. - For example: fluentbit.py calls this function when Fluent Bit is ready. - - """ - - if not is_splunk_enabled(): - return - - logging.info("Splunk has been configured successfully.") - - -def print_failed_message(): - """ - This function can be called from external module. - For example: fluentbit.py calls this function when Fluent Bit is failed. - + For example: fluentbit.py calls this function when Fluent Bit is done. 
""" - if not is_splunk_enabled(): return - logging.error("Failed to configure Splunk.") + if success: + logging.info("Splunk has been configured successfully.") + else: + logging.error("Failed to configure Splunk.") def is_splunk_enabled(): diff --git a/buildpack/telemetry/telegraf.py b/buildpack/telemetry/telegraf.py index 30860372..dcc38299 100644 --- a/buildpack/telemetry/telegraf.py +++ b/buildpack/telemetry/telegraf.py @@ -18,7 +18,7 @@ from lib.m2ee.util import strtobool from jinja2 import Template -from . import datadog, metrics, mx_java_agent, appdynamics, dynatrace, splunk +from . import appdynamics, datadog, dynatrace, metrics, mx_java_agent, newrelic, splunk NAMESPACE = "telegraf" DEPENDENCY = f"{NAMESPACE}.agent" @@ -89,6 +89,7 @@ def include_db_metrics(): or datadog.is_enabled() or appdynamics.machine_agent_enabled() or dynatrace.is_telegraf_enabled() + or newrelic.is_enabled() ): # For customers who have Datadog or AppDynamics or APPMETRICS_TARGET enabled, # we always include the database metrics. 
They can opt out @@ -109,6 +110,7 @@ def is_enabled(runtime_version): or appdynamics.machine_agent_enabled() or dynatrace.is_telegraf_enabled() or metrics.micrometer_metrics_enabled(runtime_version) + or newrelic.is_enabled() ) @@ -231,6 +233,7 @@ def _get_integration_usages(): "dynatrace": dynatrace.is_telegraf_enabled, "appdynamics": appdynamics.appdynamics_used, "splunk": splunk.is_splunk_enabled, + "newrelic": newrelic.is_enabled, } for integration, is_enabled in checker_methods.items(): @@ -255,10 +258,19 @@ def update_config(m2ee, app_name): template_path = os.path.join(_get_config_file_dir(version), TEMPLATE_FILENAME) tags = util.get_tags() + if datadog.is_enabled() and "service" not in tags: # app and / or service tag not set tags["service"] = datadog.get_service_tag() + # Add application tags to the custom metrics sent to New Relic + if newrelic.is_enabled(): + newrelic_tags = newrelic.get_metrics_tags(app_name) + + # Make sure the user defined values persist, if the tags overlap + newrelic_tags.update(tags) + tags = newrelic_tags + dynatrace_token, dynatrace_ingest_url = dynatrace.get_ingestion_info() with open(template_path, "r") as file_: @@ -284,6 +296,8 @@ def update_config(m2ee, app_name): appdynamics_output_script_path=APPDYNAMICS_OUTPUT_SCRIPT_PATH, dynatrace_enabled=dynatrace.is_telegraf_enabled(), dynatrace_config=_get_dynatrace_config(app_name), + newrelic_enabled=newrelic.is_enabled(), + newrelic_config=newrelic.get_metrics_config(), telegraf_debug_enabled=os.getenv("TELEGRAF_DEBUG_ENABLED", "false"), telegraf_fileout_enabled=strtobool( os.getenv("TELEGRAF_FILEOUT_ENABLED", "false") diff --git a/etc/fluentbit/fluentbit.conf b/etc/fluentbit/fluentbit.conf index 467e838b..aad1ba2f 100644 --- a/etc/fluentbit/fluentbit.conf +++ b/etc/fluentbit/fluentbit.conf @@ -1,8 +1,8 @@ [INPUT] - Name tcp - Listen localhost - Port ${FLUENTBIT_LOGS_PORT} - Format json + Name tcp + Listen localhost + Port ${FLUENTBIT_LOGS_PORT} + Format json [FILTER] Name lua @@ 
-16,12 +16,5 @@ script metadata.lua call add_metadata -[OUTPUT] - # SPLUNK cloud platform - Name splunk - Match * - Host ${SPLUNK_HOST} - Port ${SPLUNK_PORT} - Splunk_Token ${SPLUNK_TOKEN} - TLS On - TLS.Verify Off +# Only imports outputs from enabled integrations +@INCLUDE output_*.conf diff --git a/etc/fluentbit/metadata.lua b/etc/fluentbit/metadata.lua index e0496e58..bb4c62ea 100644 --- a/etc/fluentbit/metadata.lua +++ b/etc/fluentbit/metadata.lua @@ -2,10 +2,11 @@ function add_metadata(tag, timestamp, record) record["instance_index"] = os.getenv("CF_INSTANCE_INDEX") or "" record["environment_id"] = os.getenv("ENVIRONMENT") or "" - record["hostname"] = os.getenv("SPLUNK_APP_HOSTNAME") or "" - record["application_name"] = os.getenv("SPLUNK_APP_NAME") or "" - record["runtime_version"] = os.getenv("SPLUNK_APP_RUNTIME_VERSION") or "" - record["model_version"] = os.getenv("SPLUNK_APP_MODEL_VERSION") or "" + + record["hostname"] = os.getenv("FLUENTBIT_APP_HOSTNAME") or "" + record["application_name"] = os.getenv("FLUENTBIT_APP_NAME") or "" + record["runtime_version"] = os.getenv("FLUENTBIT_APP_RUNTIME_VERSION") or "" + record["model_version"] = os.getenv("FLUENTBIT_APP_MODEL_VERSION") or "" local raw_tags = os.getenv("TAGS") if raw_tags then diff --git a/etc/fluentbit/output_newrelic.conf b/etc/fluentbit/output_newrelic.conf new file mode 100644 index 00000000..639da2bf --- /dev/null +++ b/etc/fluentbit/output_newrelic.conf @@ -0,0 +1,5 @@ +[OUTPUT] + name nrlogs + match * + base_uri ${NEW_RELIC_LOGS_API} + api_key ${NEW_RELIC_LICENSE_KEY} diff --git a/etc/fluentbit/output_splunk.conf b/etc/fluentbit/output_splunk.conf new file mode 100644 index 00000000..21d8cb8a --- /dev/null +++ b/etc/fluentbit/output_splunk.conf @@ -0,0 +1,9 @@ +[OUTPUT] + # SPLUNK cloud platform + Name splunk + Match * + Host ${SPLUNK_HOST} + Port ${SPLUNK_PORT} + Splunk_Token ${SPLUNK_TOKEN} + TLS On + TLS.Verify Off diff --git a/etc/telegraf/telegraf.toml.j2 b/etc/telegraf/telegraf.toml.j2 
index 5b4f04b7..b3926e7a 100644 --- a/etc/telegraf/telegraf.toml.j2 +++ b/etc/telegraf/telegraf.toml.j2 @@ -70,7 +70,7 @@ {% endif %} {% if db_config %} -{% if not (datadog_api_key or appdynamics_enabled or dynatrace_enabled) %} +{% if not (datadog_api_key or appdynamics_enabled or dynatrace_enabled or newrelic_enabled) %} # PostgreSQL input (standard) [[inputs.postgresql]] address = "postgres://{{ db_config['DatabaseUserName'] }}:{{ db_config['DatabasePassword'] }}@{{ db_config['DatabaseHost'] }}/{{ db_config['DatabaseName'] }}" @@ -355,6 +355,20 @@ {% endfor %} {% endif %} +{% if newrelic_enabled %} +[[outputs.newrelic]] + metric_url = "{{ newrelic_config['metrics_base_url'] }}" + insights_key = "{{ newrelic_config['api_key'] }}" + + # we don’t need to send this as a field. The plugin will send a proper timestamp via the Metrics API. + # fielddrop = ["timestamp"] + + # Ignore any micrometer_metrics + [outputs.newrelic.tagdrop] + micrometer_metrics = ["true"] + internal_metrics = ["true"] +{% endif %} + {% if micrometer_metrics %} #################################################################################### # App metrics via micrometer # @@ -394,7 +408,7 @@ data_format = "json" json_timestamp_units = "1ns" - # tagexlude drops any non-relevant tags + # tagexclude drops any non-relevant tags tagexclude = ["host"] # Drop `mx_runtime_user_login` metrics @@ -476,7 +490,7 @@ ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md data_format = "influx" - # tagexlude drops any non-relevant tags + # tagexclude drops any non-relevant tags tagexclude = ["host"] # Drop `mx_runtime_user_login` metrics diff --git a/requirements.txt b/requirements.txt index fd624ec3..73a9487c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --resolver=backtracking 
requirements.in diff --git a/tests/integration/test_newrelic.py b/tests/integration/test_newrelic.py new file mode 100644 index 00000000..7687e78a --- /dev/null +++ b/tests/integration/test_newrelic.py @@ -0,0 +1,51 @@ +from tests.integration import basetest + + +class TestCaseDeployWithNewRelic(basetest.BaseTest): + def _deploy_app(self, mda_file, newrelic=True): + super().setUp() + + env_vars = {} + + if newrelic: + env_vars["NEW_RELIC_LICENSE_KEY"] = "dummy_token" + env_vars["NEW_RELIC_METRICS_API"] = "metrics_api" + env_vars["NEW_RELIC_LOGS_API"] = "logs_api" + + self.stage_container(mda_file, env_vars=env_vars) + self.start_container() + + def _test_fluentbit_running(self, mda_file): + self._deploy_app(mda_file) + self.assert_app_running() + + # check if Fluentbit is running + output = self.run_on_container("ps -ef| grep fluentbit") + assert output is not None + assert str(output).find("fluent-bit") >= 0 + + def _test_fluentbit_not_running(self, mda_file): + self._deploy_app(mda_file, newrelic=False) + self.assert_app_running() + + # check if Fluentbit is not running + output = self.run_on_container("ps -ef| grep fluentbit") + assert str(output).find("fluent-bit") == -1 + + def _test_newrelic_is_configured(self): + self.assert_string_in_recent_logs( + "New Relic has been configured successfully." + ) + + def _test_newrelic_is_not_configured(self): + self.assert_patterns_not_in_recent_logs( + "Failed to configure New Relic." + ) + + def test_newrelic_mx9(self): + self._test_fluentbit_running("BuildpackTestApp-mx9-7.mda") + self._test_newrelic_is_configured() + + def test_newrelic_not_configured(self): + self._test_fluentbit_not_running("BuildpackTestApp-mx9-7.mda") + self._test_newrelic_is_not_configured()