From 21dc3af202924d1f396c43435702f8e9dade09bf Mon Sep 17 00:00:00 2001 From: Gibbs Cullen Date: Mon, 27 Apr 2020 12:55:20 -0400 Subject: [PATCH] content trasferred to hugo site --- docs-beta/content/about_m3/_index.md | 8 +- docs-beta/content/about_m3/contributing.md | 177 ++++++++++- docs-beta/content/about_m3/glossary.md | 18 +- docs-beta/content/about_m3/release_notes.md | 2 +- docs-beta/content/getting_started/_index.md | 13 +- docs-beta/content/getting_started/docker.md | 42 ++- docs-beta/content/getting_started/kube.md | 209 ++++++++++++- .../content/getting_started/m3_binary.md | 154 +++++++++- docs-beta/content/how_to_guides/_index.md | 7 +- .../how_to_guides/aggregation/_index.md | 34 +++ .../how_to_guides/aggregation/namespace.md | 2 +- .../aggregation/rules_policies.md | 48 +++ docs-beta/content/how_to_guides/grafana.md | 31 +- docs-beta/content/how_to_guides/graphite.md | 108 ++++++- .../content/how_to_guides/monitoring_m3.md | 6 - .../how_to_guides/monitoring_m3/tracing.md | 41 +++ docs-beta/content/how_to_guides/other/sql.md | 2 +- docs-beta/content/how_to_guides/other/tsdb.md | 122 +++++++- .../content/how_to_guides/other/upgrading.md | 64 +++- docs-beta/content/how_to_guides/prometheus.md | 111 ++++++- docs-beta/content/intro_m3/_index.md | 37 +++ docs-beta/content/intro_m3/overview.md | 13 - docs-beta/content/media/_index.md | 6 +- .../content/operational_guides/_index.md | 6 +- .../operational_guides/managing_aggregator.md | 2 +- .../operational_guides/managing_m3db.md | 6 - .../managing_m3db/_index.md | 99 +++++- .../operational_guides/managing_m3db/etcd.md | 62 ++++ .../managing_m3db/m3db_node_mgmt.md | 127 +++++++- .../managing_m3db/namespace_mgmt.md | 4 +- .../operational_guides/managing_query.md | 282 +++++++++++++++++- docs-beta/content/quickstart/_index.md | 107 ++++++- docs-beta/content/reference_docs/_index.md | 6 +- docs-beta/content/reference_docs/apis.md | 180 ++++++++++- .../content/reference_docs/architecture.md | 6 - .../reference_docs/architecture/aggregator.md | 2 +- .../architecture/coordinator.md | 2 +- .../reference_docs/architecture/m3db.md | 41 ++- .../reference_docs/architecture/query.md | 50 +++- .../content/reference_docs/configurations.md | 6 - .../configurations/annotated_config.md | 2 +- .../configurations/availability.md | 83 +++++- .../configurations/bootstrapping.md | 97 ++++++ .../configurations/namespace_config.md | 120 +++++++- .../configurations/replication.md | 162 ++++++++++ .../configurations/topology_config.md | 79 ++++- docs-beta/content/reference_docs/operator.md | 2 +- docs-beta/content/troubleshooting/_index.md | 4 +- .../content/troubleshooting/error_msgs.md | 50 +++- docs-beta/content/troubleshooting/faqs.md | 33 +- .../content/troubleshooting/file_issue.md | 2 +- .../content/troubleshooting/resources.md | 2 +- docs-beta/content/versions/_index.md | 9 +- 53 files changed, 2785 insertions(+), 103 deletions(-) delete mode 100644 docs-beta/content/how_to_guides/monitoring_m3.md create mode 100644 docs-beta/content/intro_m3/_index.md delete mode 100644 docs-beta/content/intro_m3/overview.md delete mode 100644 docs-beta/content/operational_guides/managing_m3db.md delete mode 100644 docs-beta/content/reference_docs/architecture.md delete mode 100644 docs-beta/content/reference_docs/configurations.md diff --git a/docs-beta/content/about_m3/_index.md b/docs-beta/content/about_m3/_index.md index fd48bf221d..ba873ed56c 100644 --- a/docs-beta/content/about_m3/_index.md +++ b/docs-beta/content/about_m3/_index.md @@ -1,11 +1,13 @@ +++ title 
= "About the project" date = 2020-04-01T19:43:46-04:00 -weight = 1 +weight = 2 chapter = true -pre = "1. " +pre = "2. " +++ ### Section 1 -# Overview \ No newline at end of file +# Overview + + diff --git a/docs-beta/content/about_m3/contributing.md b/docs-beta/content/about_m3/contributing.md index 609df3a10e..1db5a7ebee 100644 --- a/docs-beta/content/about_m3/contributing.md +++ b/docs-beta/content/about_m3/contributing.md @@ -1,6 +1,181 @@ --- -title: "Contributing" +title: "III. Contributing" date: 2020-04-21T20:45:23-04:00 draft: true --- + + +Tips For Writing Documentation +Writing is easy. Writing well is hard. Writing documentation is even harder and, as one might expect, writing good documentation can be very hard. The challenge ultimately stems from the fact that good documentation requires attention to several things: consistency, clarity, detail, information density, unambiguity, and grammar (of course). +This page describes tips for writing documentation (for M3 or otherwise). There are many different, equally-valid writing styles; this document isn't meant as a prescription for how to write, but instead as optional (but encouraged) guidelines geared toward writing consistent, clear, easy-to-read documentation. +Tip #1: Avoid first- and second-person pronouns. +Documentation should always be written from a third-person point of view (specifically, with a third-person objective narrative voice); avoid using first- and second-person. The point of documentation is not to be a message from the author or to convey their point of view, but rather to be objective, factual, and canonical. +Examples + + +BAD +You will need to set Value X to "foo", as our testing indicated that "bar" can cause issues when solar rays are present. +BAD +We suggest setting Value X to "foo", as testing indicated that "bar" can cause issues when solar rays are present. +GOOD +Value X must not be set to "bar", as testing indicated that it may cause issues when solar rays are present. +GOOD +Value X must be set to "foo"; a value of "bar" may cause issues when solar rays are present. +Tip #2: Avoid subjective language. +Subjective language – language that is open to interpretation, particularly based on perspective – should be avoided wherever possible. Documentation should state objective or broadly empirical truths (and should be supported by facts). Introducing perspective inherently creates ambiguity and vagueness in the document, as its meaning becomes reader-dependent. Importantly, cases of subjective language are distinct from cases of conditional behavior. +Examples + + +BAD +Setting PowerLevel to 9000 is required, because efficiency is better than reliability in high-rate situations. +GOOD +In high-rate situations (≥1k req/s), PowerLevel should be set to 9000 in order to maximize efficiency; otherwise, the system defaults to ensuring reliability. +Tip #3: Focus on concrete ideas. +It can be easy to get sidetracked within documentation, or to bury critical points amidst tangent, ancillary, or otherwise extraneous information. When writing documentation, always ask, "What am I trying to say?". Using terse bullets can help structure information, and using a self-defined guideline for sentence (or paragraph) length can help to optimize the message such that the information density is high (i.e., no fluff or filler). +Tip #4: Use concise sentences (or fragments). +Sentences and sentence fragments should be simple and to the point (but not stuttering). 
Each sentence should ideally convey a single idea, but clearly-structured compound sentences are acceptable as well. +Examples + + +BAD +In order to time travel the MaxSpeed setting must be set to 88MPH because lower speeds will not achieve the desired result, but users should be careful around turns as high speeds are dangerous. +BAD +Time travel requires MaxSpeed to be set. The ideal value is 88MPH. Lower speeds will not achieve the desired result. Users should be careful around turns as high speeds are dangerous. +GOOD +Setting MaxSpeed to 88MPH is required to enable time travel – lower speeds will not achieve the desired result. However, users should take care around turns, as high speeds are dangerous. +Tip #5: Write down questions that the documentation should answer. +The point of writing documentation is to document the subject and educate the reader, but every piece of documentation should be written with an understanding of which questions it is answering. Questions can be as specific as "How do I deploy service X?", or as vague as "How does the system work?", but for vaguer questions, there are ostensibly sub-questions (or skeleton bullets) that help to flesh them out. +These questions can be included as a preface to the documentation if desired, which can serve to set the reader's expectations and frame of reference (both of which can help to improve comprehension and retention). +A contrived example: +After reading this document, users should be able to answer: + + - What does the system's topology look like? + - What are components X, Y, and Z, and how are they related? + - When should one use Modes 1-3 of component X? + - How can component Y be deployed in a HA capacity? + - How can components X and Y be tuned to mitigate load on component Z? + - How can all of the components be configured for correctness versus availability? + +Tip #6: Write skeleton documentation first. +Starting with a skeleton shows the full breadth and depth of a document (or set of documents) before anything is written, enabling a more holistic view of the content and helping to ensure cohesion and flow. Conversely, by fleshing out each section as it's added, it can be easy to lose track of the scope of the section relative to the document, or the scope of the document itself. This approach also serves as a TODO list, helping to ensure that sections aren't forgotten about and enabling the documentation to be divided and conquered. +A contrived example: +# The Document Title + +## Overview + + - One-sentence or elevator pitch summary + - Introduce components X, Y, and Z + +## Component X + +### Overview + - Responsible for all client ingress + - Sole entrypoint into the system + - Can be deployed to support multiple client encodings + - Adapter layer for translating all ingress from clients into + - Traffic goes: Client -> Component X -> Component Y -> Component Z + +### Deployment modes + - HA + - Lossless + - High-throughput + +## Component Y + +### Overview + - Dynamic, stateless, full-mesh cluster topology + - Deduplicates all ingress from Component X + - Partitions data among Component Z backends using perfect hashing + +### Deployment modes + - Dynamic cluster + - Static cluster + - Full mesh + - Ring network + - Isolated + +## Component Z + +### Overview + - Persists normalized client data + - Maintains a sharded, in-memory cache + +### Deployment modes + - HA + - Prioritized throughput + - Writethrough cache + - Readthrough cache + - Ring buffer + - LRU + +Tip #7: When in doubt, less is more. 
+While it may seem counterintuitive to start by writing less in a given document, terseness – specifically, fewer sentences or words to explain an idea – is easier to fix than verboseness. It is far easier to find gaps in a reader's comprehension that are caused by missing documentation than it is to try and find the minimal subset of documentation necessary to convey an idea. In other words: start short, and build out. +Examples + + +BAD +Configuring the cache to be a LRU readthrough cache ensures that the minimal amount of memory is used at the expense of guaranteed initial cache misses. Using minimal memory is important, as servers have finite memory that does not scale with the size of persisted data, and thus the bulk of the data will incur cache misses after a certain point (though compression may be used to mitigate this in the future). +GOOD +Configuring the cache as a LRU readthrough cache results in a minimal memory footprint (due to the combined behavior of both readthrough and LRU caching) at the expense of initial cache misses. Cache efficiency may be improved in the future. +Tip #8: Write for an uninitiated target audience (and call out prerequisites). +Every piece of technical documentation has at least one baseline prerequisite that the documentation builds upon; otherwise, every document would need to explain how computers work. However, in the case of M3 (as an example), the reader might be someone writing code that emits metrics, an infrastructure engineer that wants to run a metrics stack, someone with experience running other metrics stacks that is looking for nuanced tradeoffs between this and other platforms, or a distributed systems enthusiast (or a thousand other folks). +It's relatively straightforward to tailor a document to a given audience: an end-user looking to emit metrics is a much different demographic than someone wanting to run M3 in their infrastructure (etc), and as such, there is often no conflation. However, the skill level or technical depth of a given audience is often varied: a person who only vaguely understands what a metric really is will require a much broader set of information than a person who has experience using metrics in complex and interesting ways. +Thus, it's important to cater to a baseline: don't expect that every piece of technical jargon will be understood. Similarly, it's not a document's responsibility to satisfy its own prerequisites - it's okay to inform the reader that they should be familiar with X, Y, and Z, else they won't get much out of the document. If there are prerequisites that normalize the baseline of most or all readers, call that out at the beginning (e.g., "Users should read and understand Document X before reading this document"). +Tip #9: Make sure that documentation is reviewed thoroughly. +Erroneous documentation can lead to confusion, misunderstanding, incorrect assumptions, or – in the worst case – a user unknowingly implementing bad or faulty behavior. Documentation should ideally be reviewed like code: thoroughly and pedantically. Depending on the complexity, it may be worth asking reviewers to answer basic questions and update the documentation based on their responses – for example, if the question "When should ConfigValueX be set, and what are its side effects?" can't be answered, it might merit adding more documentation around the semantics of ConfigValueX, as well as its direct and side effects. +Tip #10: Have at least one less-familiar person review documentation. 
+It's extremely easy to let assumptions, implicit points, jargon, etc sneak in. Folks who are familiar with a subject are more likely to subconsciously or contextually fill in the gaps, but they are not the documentation's intended audience. Instead, try to find one or more folks who are closer to the target audience (relatively speaking) review documentation, and evaluate their comprehension (e.g., ask basic questions that the documentation should provide answers for). +Reviewers should sanity check all assumptions, and ask any questions freely – it's important to not assume that they just missed the boat on some critical information that is obvious to everyone else (most documentation should be explanatory and not expect readers to continually read between the lines). +Tip #11: Document both reality and intent. +While documentation should reflect the current state of the world, in many cases, it's also important to document the context or rationale around that state (the "what" and the "why"). Documenting only the "what" leaves readers to draw their own inferences and conclusions, while the "why" answers many of those questions outright (or, at the least, imparts a line of thinking that will more accurately inform those inferences). +Tip #12: Focus on readability. +The goal of documentation is to provide information to the reader. This means that, aside from correctness, the most important metrics of documentation are its efficiency (words per idea) and efficacy (readability of words and retention of ideas). +Reading a sentence in isolation helps to analyze the readability of an idea in a narrow context, but reading multiple sentences (or a paragraph) at a time can be a good signal of how well information flows. +Tip #13: Be pedantic about grammar. +English grammar can often be flexible, subjectively readable, or confusing (there is more than one way to skin a cat). Being clear and using simpler, well-structured sentences can help with readability by presenting digestible, unambiguous ideas. Verb tense and pronouns can play an important role as well. Ultimately, the best litmus test may be the simplest: whether a reader needs to read a sentence twice to understand it (technical terms notwithstanding). +Realistically, it is impractical to strive for or expect perfect grammar. The goal isn't to write flawless English; the goal is to write clear, concise, unambiguous, and readable documentation, for which grammar is an important (albeit not the only) tool. +Tip #14: Use verb tenses consistently. +Generally speaking, the bulk of documentation only makes sense when written in the present tense (its "common tense", so to speak), as objects are typically referred to in the abstract (e.g. "Component X" is abstract, "The running instance of Component X" proverbially concrete) and do not usually involve time (e.g. "Component X is an upstream of Component Y" refers to nominal abstract state, "Component X will be an upstream of Component Y" is prospective state). Of course, this is not always true: documentation can tell a story, and tenses should be used as needed to express ideas relative to time. +However, cases of general fact, such as describing cause and effect: +Value X -> Behavior Y + +should be described in the present tense: +Setting Value X causes Behavior Y. + +versus the future tense: +Setting Value X will cause Behavior Y. 
+ +in order to be consistent with the documentation's common tense, to maintain a time-less (and thus ongoing) objective truth, and to contrast actual time-relative statements (e.g. "Setting Value X causes Behavior Y, but will be updated to cause Behavior Z in the future"). +Tip #15: Make consistent references. +When communicating verbally, there is no need to distinguish between represented forms for a given artifact, value, method, etc - one can simply say "foo bar" to reference "the member function or property Bar on type Foo", or "foo dot bar" to reference "the YAML key bar nested under the key foo" (as simple examples). +In written documentation, however, these references can either maintain continuity throughout a document (when used consistently), or they can cause confusion (when used inconsistently). A simple framework for references is, "if it's code (or code-like), it should look like code; otherwise, treat it like a proper noun". Thankfully, most references fall into two such categories: +Code (and code-like) references, such as MyClass or myVar or some.config.property. In these cases, PreformattedText should be used, and should reference the code verbatim. All instances of such a reference should be consistent, and should not be abbreviated. +Proper (or effectively proper) names, such as "My Component" or "My System Name". In these cases, "Title Case" should be used, with the following exceptions: +Established projects, such as "etcd", "PostgreSQL", and "GitHub", should use the project's official name and format, versus arbitrary ones (e.g. "Etcd", "Postgres", "Github", etc). +First-party components, such as "the M3 Aggregator", can use shorthand when doing so does not contextually result in ambiguity (e.g. "the Aggregator", when already discussing the Aggregator), but should still be proper nouns. +For other or nuanced cases, it's more important to be consistent than it is to fit a reference into one of these simple categories. +Examples + + +BAD +The Flux Capactitor is a Y-shaped component that enables time travel. Importantly, the enclosing vehicle must be capable of providing 1.21 gigawatts of power, as the flux capacitor requires this energy to function. +BAD +FluxCapacitor's power level property should be set to 1.21. +GOOD +The Flux Capacitor is a Y-shaped component that enables time travel. Importantly, the enclosing vehicle must be capable of providing 1.21 gigawatts of power, as the Flux Capacitor requires this energy to function. +GOOD +FluxCapacitor's powerLevel property should be set to 1.21. +Tip #16: Use TODO/TODOC placeholders liberally. +Often, it might not yet be the right time to write a part of the documentation – either because a feature isn't ready yet, it's not the most practical time investment at present, or other reasons. In these cases, consider adding a TODO (in exactly the same way that code TODOs are used). If it's necessary/helpful to disambiguate code TODOs from documentation TODOs, TODOC can be used instead: it's similar enough to match a naive TODO search, but both TODO and TODOC can be reasoned about independently as desired. For example: +// FluxCapacitor TODOC(anyone) +type FluxCapacitor struct { + // TODO(anyone): use a discrete GigaWatts type instead of a float + RequiredPower float64 +} + +$ grep -rnH 'TODOC(' . +./time/timetravel/flux_capacitor.go:123: // FluxCapacitor TODOC(anyone) + +$ grep -rnH 'TODO(' . 
+./time/timetravel/flux_capacitor.go:125: // TODO(anyone): use a discrete Gi diff --git a/docs-beta/content/about_m3/glossary.md b/docs-beta/content/about_m3/glossary.md index ff081ce944..cef866e1f3 100644 --- a/docs-beta/content/about_m3/glossary.md +++ b/docs-beta/content/about_m3/glossary.md @@ -1,6 +1,22 @@ --- -title: "Glossary" +title: "I. Glossary" date: 2020-04-21T20:45:40-04:00 draft: true --- +Glossary +Bootstrapping: Process by which an M3DB node is brought up. Bootstrapping consists of determining the integrity of data that the node has, replay writes from the commit log, and/or stream missing data from its peers. +Cardinality: The number of unique metrics within the M3DB index. Cardinality increases with the number of unique tag/value combinations that are being emitted. +Datapoint: A single timestamp/value. Timeseries are composed of multiple datapoints and a series of tag/value pairs. +Labels: Pairs of descriptive words that give meaning to a metric. Tags and Labels are interchangeable terms. +Metric: A collection of uniquely identifiable tags. +M3: Highly scalable, distributed metrics platform that is comprised of a native, distributed time series database, a highly-dynamic and performant aggregation service, a query engine, and other supporting infrastructure. +M3Coordinator: A service within M3 that coordinates reads and writes between upstream systems, such as Prometheus, and downstream systems, such as M3DB. +M3DB: Distributed time series database influenced by Gorilla and Cassandra released as open source by Uber Technologies. +M3Query: A distributed query engine for M3DB. Unlike M3Coordinator, M3Query only provides supports for reads. +Namespace: Similar to a table in other types of databases, namespaces in M3DB have a unique name and a set of configuration options, such as data retention and block size. +Placement: Map of the M3DB cluster's shard replicas to nodes. Each M3DB cluster has only one placement. Placement and Topology are interchangeable terms. +Shard: Effectively the same as a "virtual shard" in Cassandra in that it provides an arbitrary distribution of time series data via a simple hash of the series ID. +Tags: Pairs of descriptive words that give meaning to a metric. Tags and Labels are interchangeable terms. +Timeseries: A series of data points tracking a particular metric over time. +Topology: Map of the M3DB cluster's shard replicas to nodes. Each M3DB cluster has only one placement. Placement and Topology are interchangeable terms. diff --git a/docs-beta/content/about_m3/release_notes.md b/docs-beta/content/about_m3/release_notes.md index 93bf0a4a86..d528e35c3b 100644 --- a/docs-beta/content/about_m3/release_notes.md +++ b/docs-beta/content/about_m3/release_notes.md @@ -1,5 +1,5 @@ --- -title: "Release_notes" +title: "II. Release notes" date: 2020-04-21T20:45:33-04:00 draft: true --- diff --git a/docs-beta/content/getting_started/_index.md b/docs-beta/content/getting_started/_index.md index 6672b1d317..94e40d019c 100644 --- a/docs-beta/content/getting_started/_index.md +++ b/docs-beta/content/getting_started/_index.md @@ -1,13 +1,14 @@ +++ title = "Getting Started" date = 2020-04-01T19:26:56-04:00 -weight = 2 +weight = 4 chapter = true -pre = "2. " +pre = "4. 
" +++ -### Section 2 +### Getting Started -# Getting Started - - \ No newline at end of file + Getting started with M3 is as easy as following one of the How-To guides: + * Getting started from the M3 Binary + * Getting started in Kubernetes + * Geting started in Docker \ No newline at end of file diff --git a/docs-beta/content/getting_started/docker.md b/docs-beta/content/getting_started/docker.md index 2ce1f044dc..d69c0209b0 100644 --- a/docs-beta/content/getting_started/docker.md +++ b/docs-beta/content/getting_started/docker.md @@ -1,6 +1,46 @@ --- -title: "Docker" +title: "In Docker" date: 2020-04-21T20:47:48-04:00 draft: true --- +Docker & Kernel Configuration +This document lists the Kernel tweaks M3DB needs to run well. If you are running on Kubernetes, you may use our sysctl-setter DaemonSet that will set these values for you. Please read the comment in that manifest to understand the implications of applying it. +Running with Docker +When running M3DB inside Docker, it is recommended to add the SYS_RESOURCE capability to the container (using the --cap-add argument to docker run) so that it can raise its file limits: +docker run --cap-add SYS_RESOURCE quay.io/m3/m3dbnode:latest + +If M3DB is being run as a non-root user, M3's setcap images are required: +docker run --cap-add SYS_RESOURCE -u 1000:1000 quay.io/m3/m3dbnode:latest-setcap + +More information on Docker's capability settings can be found here. +vm.max_map_count +M3DB uses a lot of mmap-ed files for performance, as a result, you might need to bump vm.max_map_count. We suggest setting this value to 3000000, so you don’t have to come back and debug issues later. +On Linux, you can increase the limits by running the following command as root: +sysctl -w vm.max_map_count=3000000 + +To set this value permanently, update the vm.max_map_count setting in /etc/sysctl.conf. +vm.swappiness +vm.swappiness controls how much the virtual memory subsystem will try to swap to disk. By default, the kernel configures this value to 60, and will try to swap out items in memory even when there is plenty of RAM available to the system. +We recommend sizing clusters such that M3DB is running on a substrate (hosts/containers) such that no-swapping is necessary, i.e. the process is only using 30-50% of the maximum available memory. And therefore recommend setting the value of vm.swappiness to 1. This tells the kernel to swap as little as possible, without altogether disabling swapping. +On Linux, you can configure this by running the following as root: +sysctl -w vm.swappiness=1 + +To set this value permanently, update the vm.swappiness setting in /etc/sysctl.conf. +rlimits +M3DB also can use a high number of files and we suggest setting a high max open number of files due to per partition fileset volumes. +You may need to override the system and process-level limits set by the kernel with the following commands. To check the existing values run: +sysctl -n fs.file-max + +and +sysctl -n fs.nr_open + +to see the kernel and process limits respectively. If either of the values are less than three million (our minimum recommended value), then you can update them with the following commands: +sysctl -w fs.file-max=3000000 + +sysctl -w fs.nr_open=3000000 + +To set these values permanently, update the fs.file-max and fs.nr_open settings in /etc/sysctl.conf. +Alternatively, if you wish to have M3DB run under systemd you can use our service example which will set sane defaults. 
Keep in mind that you'll still need to configure the kernel and process limits, because systemd will not allow a process to exceed them and will silently fall back to a default value, which could cause M3DB to crash due to hitting the file descriptor limit. Also note that systemd has a system.conf file and a user.conf file which may contain limits that the service-specific configuration files cannot override. Be sure to check that those files aren't configured with values lower than the value you configure at the service level.
+Before running the process, make sure the limits are set; if running manually, you can raise the limit for the current user with ulimit -n 3000000.
+
diff --git a/docs-beta/content/getting_started/kube.md b/docs-beta/content/getting_started/kube.md
index 89b7b852d6..db28f7ac1d 100644
--- a/docs-beta/content/getting_started/kube.md
+++ b/docs-beta/content/getting_started/kube.md
@@ -1,6 +1,213 @@
---
-title: "Kube"
+title: "In Kubernetes"
date: 2020-04-21T20:47:43-04:00
draft: true
---
+M3DB on Kubernetes
+Please note: if at all possible, PLEASE USE THE OPERATOR to deploy to Kubernetes. It is a considerably more streamlined setup.
+The operator leverages custom resource definitions (CRDs) to automatically handle operations such as managing cluster topology.
+The guide below provides static manifests to bootstrap a cluster on Kubernetes and should only be used as a guide to running M3 on Kubernetes if you have significant custom requirements not satisfied by the operator.
+Prerequisites
+M3DB performs better when it has access to fast disks. Every incoming write is written to a commit log, which at high volumes of writes can be sensitive to spikes in disk latency. Additionally, the random seeks into files when loading cold files benefit from lower random read latency.
+Because of this, the included manifests reference a StorageClass named fast. Manifests are provided to create such a StorageClass on AWS / Azure / GCP using the respective cloud provider's premium disk class.
+If you do not already have a StorageClass named fast, create one using one of the provided manifests:
+# AWS EBS (class io1)
+kubectl apply -f https://raw.githubusercontent.com/m3db/m3/master/kube/storage-fast-aws.yaml
+
+# Azure premium LRS
+kubectl apply -f https://raw.githubusercontent.com/m3db/m3/master/kube/storage-fast-azure.yaml
+
+# GCE Persistent SSD
+kubectl apply -f https://raw.githubusercontent.com/m3db/m3/master/kube/storage-fast-gcp.yaml
+
+If you wish to use your cloud provider's default remote disk, or another disk class entirely, you'll have to modify the manifests.
+If your Kubernetes cluster spans multiple availability zones, it's important to specify a Volume Binding Mode of WaitForFirstConsumer in your StorageClass to delay the binding of the PersistentVolume until the Pod is created.
+Kernel Configuration
+We provide a Kubernetes DaemonSet that can make setting host-level sysctls easier. Please see the kernel docs for more.
+Note that our default StatefulSet spec will give the M3DB container CAP_SYS_RESOURCE so it may raise its file limits. Uncomment the securityContext on the m3db container in the StatefulSet if running with a Pod Security Policy or similar enforcement mechanism that prevents adding capabilities to containers.
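+For reference, the securityContext stanza that grants this capability on the m3db container looks roughly like the following. This is a minimal sketch of the relevant fields only; surrounding container fields are elided, and the exact layout in the bundle manifest may differ:
+containers:
+  - name: m3db
+    # ... image, ports, volume mounts, and other container fields elided ...
+    securityContext:
+      capabilities:
+        add:
+          # Allows the process to raise its own file limits (rlimits)
+          - SYS_RESOURCE
+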
+Deploying +Apply the following manifest to create your cluster: +kubectl apply -f https://raw.githubusercontent.com/m3db/m3/master/kube/bundle.yaml + +Applying this bundle will create the following resources: +An m3db Namespace for all M3DB-related resources. +A 3-node etcd cluster in the form of a StatefulSet backed by persistent remote SSDs. This cluster stores the DB topology and other runtime configuration data. +A 3-node M3DB cluster in the form of a StatefulSet. +Headless services for the etcd and m3db StatefulSets to provide stable DNS hostnames per-pod. +Wait until all created pods are listed as ready: +$ kubectl -n m3db get po +NAME READY STATUS RESTARTS AGE +etcd-0 1/1 Running 0 22m +etcd-1 1/1 Running 0 22m +etcd-2 1/1 Running 0 22m +m3dbnode-0 1/1 Running 0 22m +m3dbnode-1 1/1 Running 0 22m +m3dbnode-2 1/1 Running 0 22m + +You can now proceed to initialize a namespace and placement for the cluster the same as you would for our other how-to guides: +# Open a local connection to the coordinator service: +$ kubectl -n m3db port-forward svc/m3coordinator 7201 +Forwarding from 127.0.0.1:7201 -> 7201 +Forwarding from [::1]:7201 -> 7201 + +# Create an initial cluster topology +curl -sSf -X POST localhost:7201/api/v1/placement/init -d '{ + "num_shards": 1024, + "replication_factor": 3, + "instances": [ + { + "id": "m3dbnode-0", + "isolation_group": "pod0", + "zone": "embedded", + "weight": 100, + "endpoint": "m3dbnode-0.m3dbnode:9000", + "hostname": "m3dbnode-0.m3dbnode", + "port": 9000 + }, + { + "id": "m3dbnode-1", + "isolation_group": "pod1", + "zone": "embedded", + "weight": 100, + "endpoint": "m3dbnode-1.m3dbnode:9000", + "hostname": "m3dbnode-1.m3dbnode", + "port": 9000 + }, + { + "id": "m3dbnode-2", + "isolation_group": "pod2", + "zone": "embedded", + "weight": 100, + "endpoint": "m3dbnode-2.m3dbnode:9000", + "hostname": "m3dbnode-2.m3dbnode", + "port": 9000 + } + ] +}' + +# Create a namespace to hold your metrics +curl -X POST localhost:7201/api/v1/namespace -d '{ + "name": "default", + "options": { + "bootstrapEnabled": true, + "flushEnabled": true, + "writesToCommitLog": true, + "cleanupEnabled": true, + "snapshotEnabled": true, + "repairEnabled": false, + "retentionOptions": { + "retentionPeriodDuration": "720h", + "blockSizeDuration": "12h", + "bufferFutureDuration": "1h", + "bufferPastDuration": "1h", + "blockDataExpiry": true, + "blockDataExpiryAfterNotAccessPeriodDuration": "5m" + }, + "indexOptions": { + "enabled": true, + "blockSizeDuration": "12h" + } + } +}' + +Shortly after you should see your nodes finish bootstrapping: +$ kubectl -n m3db logs -f m3dbnode-0 +21:36:54.831698[I] cluster database initializing topology +21:36:54.831732[I] cluster database resolving topology +21:37:22.821740[I] resolving namespaces with namespace watch +21:37:22.821813[I] updating database namespaces [{adds [metrics]} {updates []} {removals []}] +21:37:23.008109[I] node tchannelthrift: listening on 0.0.0.0:9000 +21:37:23.008384[I] cluster tchannelthrift: listening on 0.0.0.0:9001 +21:37:23.217090[I] node httpjson: listening on 0.0.0.0:9002 +21:37:23.217240[I] cluster httpjson: listening on 0.0.0.0:9003 +21:37:23.217526[I] bootstrapping shards for range starting [{run bootstrap-data} {bootstrapper filesystem} ... +... 
+21:37:23.239534[I] bootstrap data fetched now initializing shards with series blocks [{namespace metrics} {numShards 256} {numSeries 0}] +21:37:23.240778[I] bootstrap finished [{namespace metrics} {duration 23.325194ms}] +21:37:23.240856[I] bootstrapped +21:37:29.733025[I] successfully updated topology to 3 hosts + +You can now write and read metrics using the API on the DB nodes: +$ kubectl -n m3db port-forward svc/m3dbnode 9003 +Forwarding from 127.0.0.1:9003 -> 9003 +Forwarding from [::1]:9003 -> 9003 + +curl -sSf -X POST localhost:9003/writetagged -d '{ + "namespace": "default", + "id": "foo", + "tags": [ + { + "name": "city", + "value": "new_york" + }, + { + "name": "endpoint", + "value": "/request" + } + ], + "datapoint": { + "timestamp": '"$(date "+%s")"', + "value": 42.123456789 + } +}' + +$ curl -sSf -X POST http://localhost:9003/query -d '{ + "namespace": "default", + "query": { + "regexp": { + "field": "city", + "regexp": ".*" + } + }, + "rangeStart": 0, + "rangeEnd": '"$(date "+%s")"' +}' | jq . + +{ + "results": [ + { + "id": "foo", + "tags": [ + { + "name": "city", + "value": "new_york" + }, + { + "name": "endpoint", + "value": "/request" + } + ], + "datapoints": [ + { + "timestamp": 1527630053, + "value": 42.123456789 + } + ] + } + ], + "exhaustive": true +} + +Adding nodes +You can easily scale your M3DB cluster by scaling the StatefulSet and informing the cluster topology of the change: +kubectl -n m3db scale --replicas=4 statefulset/m3dbnode + +Once the pod is ready you can modify the cluster topology: +kubectl -n m3db port-forward svc/m3coordinator 7201 +Forwarding from 127.0.0.1:7201 -> 7201 +Forwarding from [::1]:7201 -> 7201 + +curl -sSf -X POST localhost:7201/api/v1/placement -d '{ + "instances": [ + { + "id": "m3dbnode-3", + "isolation_group": "pod3", + "zone": "embedded", + "weight": 100, + "endpoint": "m3dbnode-3.m3dbnode:9000", + "hostname": "m3dbnode-3.m3dbnode", + "port": 9000 + } + ] +}' diff --git a/docs-beta/content/getting_started/m3_binary.md b/docs-beta/content/getting_started/m3_binary.md index 7c14749b89..5e68e63479 100644 --- a/docs-beta/content/getting_started/m3_binary.md +++ b/docs-beta/content/getting_started/m3_binary.md @@ -1,6 +1,158 @@ --- -title: "M3_binary" +title: "From M3 Binary" date: 2020-04-21T20:47:36-04:00 draft: true --- +M3DB Cluster Deployment, Manually (The Hard Way) +Introduction +This document lists the manual steps involved in deploying a M3DB cluster. In practice, you'd be automating this using Terraform or using Kubernetes rather than doing this by hand; guides for doing so are available under the How-To section. +Primer Architecture +A quick primer on M3DB architecture. Here’s what a typical deployment looks like: + +A few different things to highlight about the diagram: +Role Type +There are three ‘role types’ for a m3db deployment - +Coordinator: m3coordinator serves to coordinate reads and writes across all hosts in the cluster. It’s a lightweight process, and does not store any data. This role would typically be run alongside a Prometheus instance, or be baked into a collector agent. +Storage Node: m3dbnode processes running on these hosts are the workhorses of the database, they store data; and serve reads and writes. +Seed Node: First and foremost, these hosts are storage nodes themselves. In addition to that responsibility, they run an embedded ETCD server. This is to allow the various M3DB processes running across the cluster to reason about the topology/configuration of the cluster in a consistent manner. 
+Note: In very large deployments, you’d use a dedicated etcd cluster, and only use M3DB storage and coordinator nodes.
+Provisioning
+Enough background; let’s get you going with a real cluster! Provision your hosts (be they VMs from AWS/GCP/etc. or bare-metal servers in your DC) with the latest and greatest flavour of Linux you favor. M3DB works on all popular distributions - Ubuntu/RHEL/CentOS; let us know if you run into issues on another platform and we’ll be happy to assist.
+Network
+If you’re using AWS or GCP, it is highly advised to use static IPs so that if you need to replace a host, you don’t have to update your configuration files on all the hosts; you simply decommission the old seed node and provision a new seed node with the same host ID and static IP that the old seed node had. For AWS you can use an Elastic Network Interface on a VPC and for GCP you can simply use an internal static IP address.
+In this example, you will be creating three static IP addresses for the three seed nodes.
+Further, we assume you have hostnames configured correctly too, i.e. running hostname on a host in the cluster returns the host ID you'll be using when specifying instance host IDs when creating the M3DB cluster placement. E.g. running hostname on a node m3db001 should return its host ID, m3db001.
+In GCP, the name of your instance when you create it will automatically be its hostname. When you create an instance, click "Management, disks, networking, SSH keys", and under "Networking" click the default interface, click the "Primary internal IP" drop-down, select "Reserve a static internal IP address", give it a name, i.e. m3db001, and a description noting it's a seed node IP address, and use "Assign automatically".
+In AWS it might be simpler to just use whatever hostname you get for the provisioned VM as your host ID when specifying the M3DB placement. Either that, or use the environment host ID resolver and pass your host ID when launching the database process with an environment variable. You can specify the environment variable name in config, e.g. envVarName: M3DB_HOST_ID if you are using an environment variable named M3DB_HOST_ID, and set that variable to the host ID when starting the process.
+Relevant config snippet:
+hostID:
+  resolver: environment
+  envVarName: M3DB_HOST_ID
+
+Then start your process with:
+M3DB_HOST_ID=m3db001 m3dbnode -f config.yml
+
+Kernel
+Ensure you review our recommended kernel configuration before running M3DB in production, as M3DB may exceed the default limits for some kernel values.
+Config files
+We wouldn’t feel right calling this guide “The Hard Way” without requiring you to change some configs by hand.
+Note: the steps that follow assume you have the following 3 seed nodes - make the necessary adjustments if you have more or are using a dedicated etcd cluster. Example seed nodes:
+m3db001 (Region=us-east1, Zone=us-east1-a, Static IP=10.142.0.1)
+m3db002 (Region=us-east1, Zone=us-east1-b, Static IP=10.142.0.2)
+m3db003 (Region=us-east1, Zone=us-east1-c, Static IP=10.142.0.3)
+We’re going to start with the M3DB config template and modify it to work for your cluster. Start by downloading the config.
Update the config ‘service’ and 'seedNodes' sections to read as follows: +config: + service: + env: default_env + zone: embedded + service: m3db + cacheDir: /var/lib/m3kv + etcdClusters: + - zone: embedded + endpoints: + - 10.142.0.1:2379 + - 10.142.0.2:2379 + - 10.142.0.3:2379 + seedNodes: + initialCluster: + - hostID: m3db001 + endpoint: http://10.142.0.1:2380 + - hostID: m3db002 + endpoint: http://10.142.0.2:2380 + - hostID: m3db003 + endpoint: http://10.142.0.3:2380 + +Start the seed nodes +Transfer the config you just crafted to each host in the cluster. And then starting with the seed nodes, start up the m3dbnode process: +m3dbnode -f + +Note, remember to daemon-ize this using your favourite utility: systemd/init.d/supervisor/etc +Create Namespace and Initialize Topology +The recommended way to create a namespace and initialize a topology is to use the /api/v1/database/create api. Below is an example. +Note: In order to create a more custom setup, please refer to the namespace configuration and placement configuration guides, though this is discouraged. +curl -X POST http://localhost:7201/api/v1/database/create -d '{ + "type": "cluster", + "namespaceName": "1week_namespace", + "retentionTime": "168h", + "numShards": "1024", + "replicationFactor": "3", + "hosts": [ + { + "id": "m3db001", + "isolationGroup": "us-east1-a", + "zone": "embedded", + "weight": 100, + "address": "10.142.0.1", + "port": 9000 + }, + { + "id": "m3db002", + "isolationGroup": "us-east1-b", + "zone": "embedded", + "weight": 100, + "address": "10.142.0.2", + "port": 9000 + }, + { + "id": "m3db003", + "isolationGroup": "us-east1-c", + "zone": "embedded", + "weight": 100, + "address": "10.142.0.3", + "port": 9000 + } + ] +}' + +Note: Isolation group specifies how the cluster places shards to avoid more than one replica of a shard appearing in the same replica group. As such you must be using at least as many isolation groups as your replication factor. In this example we use the availibity zones us-east1-a, us-east1-b, us-east1-c as our isolation groups which matches our replication factor of 3. +Shortly after, you should see your node complete bootstrapping: +20:10:12.911218[I] updating database namespaces [{adds [default]} {updates []} {removals []}] +20:10:13.462798[I] node tchannelthrift: listening on 0.0.0.0:9000 +20:10:13.463107[I] cluster tchannelthrift: listening on 0.0.0.0:9001 +20:10:13.747173[I] node httpjson: listening on 0.0.0.0:9002 +20:10:13.747506[I] cluster httpjson: listening on 0.0.0.0:9003 +20:10:13.747763[I] bootstrapping shards for range starting ... +... +20:10:13.757834[I] bootstrap finished [{namespace metrics} {duration 10.1261ms}] +20:10:13.758001[I] bootstrapped +20:10:14.764771[I] successfully updated topology to 3 hosts + +If you need to setup multiple namespaces, you can run the above /api/v1/database/create command multiple times with different namespace configurations. +Replication factor (RF) +Recommended is RF3, where each replica is spread across failure domains such as a rack, data center or availability zone. See Replication Factor Recommendations for more specifics. +Shards +See placement configuration to determine the appropriate number of shards to specify. 
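+Verify the setup
+Before testing any writes, it can be helpful to confirm that the placement and namespace created above were registered as expected. The coordinator exposes read endpoints for both; the commands below are a quick sanity check against a local coordinator on port 7201, and assume jq is installed:
+# Inspect the current placement (shard-to-node assignments)
+curl -sSf localhost:7201/api/v1/placement | jq .
+
+# List the namespaces known to the cluster
+curl -sSf localhost:7201/api/v1/namespace | jq .
+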
+Test it out +Now you can experiment with writing tagged metrics: +curl -sS -X POST localhost:9003/writetagged -d '{ + "namespace": "metrics", + "id": "foo", + "tags": [ + { + "name": "city", + "value": "new_york" + }, + { + "name": "endpoint", + "value": "/request" + } + ], + "datapoint": { + "timestamp": '"$(date "+%s")"', + "value": 42.123456789 + } +}' + +And reading the metrics you've written: +curl -sS -X POST http://localhost:9003/query -d '{ + "namespace": "metrics", + "query": { + "regexp": { + "field": "city", + "regexp": ".*" + } + }, + "rangeStart": 0, + "rangeEnd": '"$(date "+%s")"' +}' | jq . diff --git a/docs-beta/content/how_to_guides/_index.md b/docs-beta/content/how_to_guides/_index.md index 8aa91990fd..34c5ac5914 100644 --- a/docs-beta/content/how_to_guides/_index.md +++ b/docs-beta/content/how_to_guides/_index.md @@ -1,14 +1,13 @@ +++ title = "How To Guides" date = 2020-04-01T19:26:56-04:00 -weight = 3 +weight = 5 chapter = true -pre = "3. " +pre = "5. " +++ -### Section 3 +### Section 4 # How To Guides -Aggregation, Sending Metrics (for Prometheus and Graphite), and Querying (for Prometheus and Graphite) diff --git a/docs-beta/content/how_to_guides/aggregation/_index.md b/docs-beta/content/how_to_guides/aggregation/_index.md index 1c2ed5248c..8588861031 100644 --- a/docs-beta/content/how_to_guides/aggregation/_index.md +++ b/docs-beta/content/how_to_guides/aggregation/_index.md @@ -4,3 +4,37 @@ date: 2020-04-02T12:41:51-04:00 draft: true --- +Aggregation +You will notice that in the setup linked above, M3DB has just one unaggregated namespace configured. If you want aggregated metrics, you will need to set up an aggregated namespace in M3DB and in the m3query configuration. It is important to note that all writes go to all namespaces so as long as you include all namespaces in your query config, you will be querying all namespaces. Aggregation is done strictly by the query service. For example if you have an aggregated namespace setup in M3DB named metrics_10s_48h, you can add the following to the query config: +- namespace: metrics_10s_48h + type: aggregated + retention: 48h + resolution: 10s + +Disabling automatic aggregation +If you run Statsite, m3agg, or some other aggregation tier, you will want to set the all flag under downsample to false. Otherwise, you will be aggregating metrics that have already been aggregated. +- namespace: metrics_10s_48h + type: aggregated + retention: 48h + resolution: 10s + downsample: + all: false + +ID generation +The default generation scheme for IDs, legacy, is unfortunately prone to collisions, but remains the default for backwards compatibility reasons. It is suggested to set the ID generation scheme to one of either quoted or prepend_meta. quoted generation scheme yields the most human-readable IDs, whereas prepend_meta is better for more compact IDs, or if tags are expected to contain non-ASCII characters. To set the ID generation scheme, add the following to your m3coordinator configuration yaml file: +tagOptions: + idScheme: + +As an example of how these schemes generate IDs, consider a series with the following 4 tags, [{"t1":v1}, {t2:"v2"}, {t3:v3}, {t4:v4}]. The following is an example of how different schemes will generate IDs. 
+legacy: "t1"=v1,t2="v2",t3=v3,t4=v4, +prepend_meta: 4,2,2,4,2,2,2,2!"t1"v1t2"v2"t3v3t4v4 +quoted: {\"t1\"="v1",t2="\"v2\"",t3="v3",t4="v4"} + +If there is a chance that your metric tags will contain "control" characters, specifically , and =, it is highly recommended that one of either the quoted or prepend_meta schemes are specified, as the legacy scheme may cause ID collisions. As a general guideline, we suggest quoted, as it mirrors the more familiar Prometheus style IDs. +We technically have a fourth ID generation scheme that is used for Graphite IDs, but it is exclusive to the Graphite ingestion path and is not selectable as a general scheme. +WARNING: Once a scheme is selected, be very careful about changing it. If changed, all incoming metrics will resolve to a new ID, effectively doubling the metric cardinality until all of the older-style metric IDs fall out of retention. +Migration +We recently updated our ID generation scheme in m3coordinator to avoid the collision issues discussed above. To ease migration, we're temporarily enforcing that an ID generation scheme be explicitly provided in the m3coordinator configuration files. +If you have been running m3query or m3coordinator already, you may want to counterintuitively select the collision-prone legacy scheme, as all the IDs for all of your current metrics would have already been generated with this scheme, and choosing another will effectively double your index size. If the twofold increase in cardinality is an acceptable increase (and unfortunately, this is likely to mean doubled cardinality until your longest retention cluster rotates out), it's suggested to choose a collision-resistant scheme instead. +An example of a configuration file for a standalone m3query instance with the ID generation scheme can be found here. If you're running m3query or m3coordinator embedded, these configuration options should be nested under the coordinator: heading, as seen here. +If none of these options work for you, or you would like further clarification, please stop by our gitter channel and we'll be happy to help you. diff --git a/docs-beta/content/how_to_guides/aggregation/namespace.md b/docs-beta/content/how_to_guides/aggregation/namespace.md index 5d831efe54..e7170da7a2 100644 --- a/docs-beta/content/how_to_guides/aggregation/namespace.md +++ b/docs-beta/content/how_to_guides/aggregation/namespace.md @@ -1,5 +1,5 @@ --- -title: "Namespace" +title: "Adding a namespace" date: 2020-04-21T20:55:04-04:00 draft: true --- diff --git a/docs-beta/content/how_to_guides/aggregation/rules_policies.md b/docs-beta/content/how_to_guides/aggregation/rules_policies.md index a386d888b4..7b8553f8bf 100644 --- a/docs-beta/content/how_to_guides/aggregation/rules_policies.md +++ b/docs-beta/content/how_to_guides/aggregation/rules_policies.md @@ -4,3 +4,51 @@ date: 2020-04-02T12:36:33-04:00 draft: true --- +Configuring Mapping & Rollup Rules +Mapping Rules +Mapping rules are used to configure the storage policy for metrics. The storage policy determines how long to store metrics for and at what resolution to keep them at. For example, a storage policy of 1m:48h tells M3 to keep the metrics for 48hrs at a 1min resolution. Mapping rules can be configured in the m3coordinator configuration file under the downsample > rules > mappingRules stanza. We will use the following as an example. 
+downsample: + rules: + mappingRules: + - name: "mysql metrics" + filter: "app:mysql*" + aggregations: ["Last"] + storagePolicies: + - resolution: 1m + retention: 48h + - name: "nginx metrics" + filter: "app:nginx*" + aggregations: ["Last"] + storagePolicies: + - resolution: 30s + retention: 24h + - resolution: 1m + retention: 48h + +Here, we have two mapping rules configured -- one for mysql metrics and one for nginx metrics. The filter determines what metrics each rule applies to. The mysql metrics rule will apply to any metrics where the app tag contains mysql* as the value (* being a wildcard). Similarly, the nginx metrics rule will apply to all metrics where the app tag contains nginx* as the value. +The aggregations field determines what functions to apply to the datapoints within a resolution tile. For example, if an application emits a metric every 10sec and the resolution for that metrics's storage policy is 1min, M3 will need to combine 6 datapoints. If the aggregations policy is Last, M3 will take the last value in that 1min bucket. aggregations can be one of the following: +Last +Min +Max +Mean +Median +Count +Sum +SumSq +Stdev +P10 +P20 +P30 +P40 +P50 +P60 +P70 +P80 +P90 +P95 +P99 +P999 +P9999 + +Lastly, the storagePolicies field determines which namespaces to store the metrics in. For example, the mysql metrics will be sent to the 1m:48h namespace, while the nginx metrics will be sent to both the 1m:48h and 30s:24h namespaces. +Note: the namespaces listed under the storagePolicies stanza must exist in M3DB. diff --git a/docs-beta/content/how_to_guides/grafana.md b/docs-beta/content/how_to_guides/grafana.md index cd49b3c001..1c3d759615 100644 --- a/docs-beta/content/how_to_guides/grafana.md +++ b/docs-beta/content/how_to_guides/grafana.md @@ -1,6 +1,35 @@ --- -title: "Grafana" +title: "Integrating with Grafana" date: 2020-04-21T20:53:35-04:00 draft: true --- +Grafana +You can also set up m3query as a datasource in Grafana. To do this, add a new datasource with a type of Prometheus. The URL should point to the host/port running m3query. By default, m3query runs on port 7201. + + +Querying With Grafana +When using the Prometheus integration with Grafana, there are two different ways you can query for your metrics. The first option is to configure Grafana to query Prometheus directly by following these instructions. +Alternatively, you can configure Grafana to read metrics directly from M3Coordinator in which case you will bypass Prometheus entirely and use M3's PromQL engine instead. To set this up, follow the same instructions from the previous step, but set the url to: http://:7201. + + +Querying +M3 supports the the majority of graphite query functions and can be used to query metrics that were ingested via the ingestion pathway described above. +Grafana +M3Coordinator implements the Graphite source interface, so you can add it as a graphite source in Grafana by following these instructions. +Note that you'll need to set the URL to: http://:7201/api/v1/graphite +Direct +You can query for metrics directly by issuing HTTP GET requests directly against the M3Coordinator /api/v1/graphite/render endpoint which runs on port 7201 by default. For example: +(export now=$(date +%s) && curl "localhost:7201/api/v1/graphite/render?target=transformNull(foo.*.baz)&from=$(($now-300))" | jq .) + +will query for all metrics matching the foo.*.baz pattern, applying the transformNull function, and returning all datapoints for the last 5 minutes. 
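+For the datasource setup described above, Grafana's provisioning mechanism can also be used instead of the UI. The snippet below is a minimal sketch of a datasource provisioning file for an m3query-backed Prometheus-type source; the datasource name and host are placeholders, so adjust them for your environment:
+apiVersion: 1
+datasources:
+  # m3query speaks the Prometheus query API, so it is registered as a "prometheus"
+  # type datasource pointing at the query service's HTTP port (7201 by default).
+  - name: M3Query
+    type: prometheus
+    access: proxy
+    url: http://m3query.example.com:7201
+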
+ +Grafana +M3 supports a variety of Grafana integrations. +Prometheus / Graphite Sources +M3Coordinator can function as a datasource for Prometheus as well as Graphite. See the Prometheus integration and Graphite integration documents respectively for more information. +Pre-configured Prometheus Dashboards +All M3 applications expose Prometheus metrics on port 7203 by default as described in the Prometheus integration guide, so if you're already monitoring your M3 stack with Prometheus and Grafana you can use our pre-configured dashboards. +M3DB Prometheus / Grafana dashboard +M3Coordinator Prometheus / Grafana dashboard: TODO +Alternatively, you can obtain the JSON for our most up-to-date dashboards from our git repo directly. diff --git a/docs-beta/content/how_to_guides/graphite.md b/docs-beta/content/how_to_guides/graphite.md index 7907c45605..4bca2e9e20 100644 --- a/docs-beta/content/how_to_guides/graphite.md +++ b/docs-beta/content/how_to_guides/graphite.md @@ -1,6 +1,112 @@ --- -title: "Graphite" +title: "Integrating with Graphite" date: 2020-04-21T20:52:20-04:00 draft: true --- +Graphite +This document is a getting started guide to integrating the M3 stack with Graphite. +Overview +M3 supports ingesting Graphite metrics using the Carbon plaintext protocol. We also support a variety of aggregation and storage policies for the ingestion pathway (similar to storage-schemas.conf when using Graphite Carbon) that are documented below. Finally, on the query side, we support the majority of graphite query functions. +Ingestion +Setting up the M3 stack to ingest carbon metrics is straightforward. First, make sure you've followed our other documentation to get m3coordinator and M3DB setup. Also, familiarize yourself with how M3 handles aggregation. +Once you have both of those services running properly, modify your m3coordinator configuration to add the following lines and restart it: +carbon: + ingester: + listenAddress: "0.0.0.0:7204" + +This will enable a line-based TCP carbon ingestion server on the specified port. By default, the server will write all carbon metrics to every aggregated namespace specified in the m3coordinator configuration file and aggregate them using a default strategy of mean (equivalent to Graphite's Average). +This default setup makes sense if your carbon metrics are unaggregated, however, if you've already aggregated your data using something like statsite then you may want to disable M3 aggregation. In that case, you can do something like the following: +carbon: + ingester: + listenAddress: "0.0.0.0:7204" + rules: + - pattern: .* + aggregation: + enabled: false + policies: + - resolution: 1m + retention: 48h + +This replaces M3's default behavior with a single rule which states that all metrics (since .* will match any string) should be written to whichever aggregated M3DB namespace has been configured with a resolution of 1 minute and a retention of 48 hours, bypassing aggregation / downsampling altogether. Note that there must be a configured M3DB namespace with the specified resolution/retention or the coordinator will fail to start. +In the situation that you choose to use M3's aggregation functionality, there are a variety of aggregation types you can choose from. 
For example:
+carbon:
+  ingester:
+    listenAddress: "0.0.0.0:7204"
+    rules:
+      - pattern: .*
+        aggregation:
+          type: last
+        policies:
+          - resolution: 1m
+            retention: 48h
+
+The config above will aggregate ingested carbon metrics into 1 minute tiles, but instead of taking the mean of every datapoint, it will emit the last datapoint that was received within a given tile's window.
+Similar to Graphite's storage-schemas.conf, M3 carbon ingestion rules are applied in order and only the first pattern that matches is applied. In addition, the rules can be as simple or as complex as you like. For example:
+carbon:
+  ingester:
+    listenAddress: "0.0.0.0:7204"
+    rules:
+      - pattern: stats.internal.financial-service.*
+        aggregation:
+          type: max
+        policies:
+          - resolution: 1m
+            retention: 4320h
+          - resolution: 10s
+            retention: 24h
+      - pattern: stats.internal.rest-proxy.*
+        aggregation:
+          type: mean
+        policies:
+          - resolution: 10s
+            retention: 2h
+      - pattern: stats.cloud.*
+        aggregation:
+          enabled: false
+        policies:
+          - resolution: 1m
+            retention: 2h
+      - pattern: .*
+        aggregation:
+          type: mean
+        policies:
+          - resolution: 1m
+            retention: 48h
+
+Let's break that down.
+The first rule states that any metric matching the pattern stats.internal.financial-service.* should be aggregated using the max function (meaning the datapoint with the highest value that is received in a given window will be retained) to generate two different tiles, one with 1 minute resolution and another with 10 second resolution, which will be written out to M3DB namespaces with 4320 hour and 24 hour retentions respectively.
+The second rule will aggregate all the metrics coming from our rest-proxy service using a mean type aggregation (all datapoints within a given window will be averaged) to generate 10 second tiles and write them out to an M3DB namespace that stores data for two hours.
+The third rule will match any metrics coming from our cloud environment. In this hypothetical example, our cloud metrics are already aggregated using an application like statsite, so instead of aggregating them again, we just write them directly to an M3DB namespace that retains data for two hours. Note that while we're not aggregating the data in M3 here, we still need to provide a resolution so that the ingester can match the storage policy to a known M3DB namespace, and so that when we fan out queries to multiple namespaces we know the resolution of the data contained in each namespace.
+Finally, our last rule uses a "catch-all" pattern to capture any metrics that don't match any of our other rules and aggregate them using the mean function into 1 minute tiles which we store for 48 hours.
+Debug mode
+If at any time you're not sure which metrics are being matched by which patterns, or want more visibility into how the carbon ingestion rules are being evaluated, modify the config to enable debug mode:
+carbon:
+  ingester:
+    debug: true
+    listenAddress: "0.0.0.0:7204"
+
+This will make the carbon ingester emit logs for every step that is taken. Note: If your coordinator is ingesting a lot of data, enabling this mode could bring the process to a halt due to the I/O overhead, so use this feature cautiously in production environments.
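+
+To sanity check an ingestion setup like the ones above, you can write a test datapoint over the Carbon plaintext protocol and read it back through the Graphite render endpoint. This is a rough sketch that assumes the ingester and coordinator from the examples above are running locally; the metric name is arbitrary:
+# write a single datapoint ("<path> <value> <timestamp>") to the carbon ingester
+echo "foo.bar.baz 42 $(date +%s)" | nc localhost 7204
+
+# once the datapoint has been ingested, read it back via the render endpoint
+export now=$(date +%s) && curl "localhost:7201/api/v1/graphite/render?target=foo.bar.baz&from=$(($now-300))" | jq .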
+Supported Aggregation Functions +last +min +max +mean +median +count +sum +sumsq +stdev +p10 +p20 +p30 +p40 +p50 +p60 +p70 +p80 +p90 +p95 +p99 +p999 +p9999 diff --git a/docs-beta/content/how_to_guides/monitoring_m3.md b/docs-beta/content/how_to_guides/monitoring_m3.md deleted file mode 100644 index 0cb3481d4d..0000000000 --- a/docs-beta/content/how_to_guides/monitoring_m3.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: "Monitoring_m3" -date: 2020-04-21T20:49:29-04:00 -draft: true ---- - diff --git a/docs-beta/content/how_to_guides/monitoring_m3/tracing.md b/docs-beta/content/how_to_guides/monitoring_m3/tracing.md index b7c0c307eb..2649b1e982 100644 --- a/docs-beta/content/how_to_guides/monitoring_m3/tracing.md +++ b/docs-beta/content/how_to_guides/monitoring_m3/tracing.md @@ -4,3 +4,44 @@ date: 2020-04-21T20:56:40-04:00 draft: true --- +Tracing +M3DB is integrated with opentracing to provide insight into query performance and errors. +Jaeger +To enable Jaeger as the tracing backend, set tracing.backend to "jaeger" (see also our sample local config: +tracing: + backend: jaeger # enables jaeger with default configs + jaeger: + # optional configuration for jaeger -- see + # https://github.com/jaegertracing/jaeger-client-go/blob/master/config/config.go#L37 + # for options + ... + +Jaeger can be run locally with docker as described in https://www.jaegertracing.io/docs/1.9/getting-started/. +The default configuration will report traces via udp to localhost:6831; using the all-in-one jaeger container, they will be accessible at +http://localhost:16686 +N.B.: for production workloads, you will almost certainly want to use sampler.type=remote with adaptive sampling for Jaeger, as write volumes are likely orders of magnitude higher than read volumes in most timeseries systems. +LightStep +To use LightStep as the tracing backend, set tracing.backend to "lightstep" and configure necessary information for your client under lightstep. Any options exposed in lightstep-tracer-go can be set in config. Any environment variables may be interpolated. For example: +tracing: + serviceName: m3coordinator + backend: lightstep + lightstep: + access_token: ${LIGHTSTEP_ACCESS_TOKEN:""} + collector: + scheme: https + host: my-satellite-address.domain + port: 8181 + +Alternative backends +If you'd like additional backends, we'd love to support them! +File an issue against M3 and we can work with you on how best to add the backend. The first time's going to be a little rough--opentracing unfortunately doesn't support Go plugins (yet--see https://github.com/opentracing/opentracing-go/issues/133), and glide's update model means that adding dependencies directly will update everything, which isn't ideal for an isolated dependency change. These problems are all solvable though, and we'll work with you to make it happen! +Use cases +Note: all URLs assume a local jaeger setup as described in Jaeger's docs. +Finding slow queries +To find prom queries longer than , filter for minDuration >= on operation="GET /api/v1/query_range". +Sample query: http://localhost:16686/search?end=1548876672544000&limit=20&lookback=1h&maxDuration&minDuration=1ms&operation=GET%20%2Fapi%2Fv1%2Fquery_range&service=m3query&start=1548873072544000 +Finding queries with errors +Search for error=true on operation="GET /api/v1/query_range" http://localhost:16686/search?operation=GET%20%2Fapi%2Fv1%2Fquery_range&service=m3query&tags=%7B%22error%22%3A%22true%22%7D +Finding 500 (Internal Server Error) responses +Search for http.status_code=500. 
+http://localhost:16686/search?limit=20&lookback=24h&maxDuration&minDuration&operation=GET%20%2Fapi%2Fv1%2Fquery_range&service=m3query&start=1548802430108000&tags=%7B"http.status_code"%3A"500"%
diff --git a/docs-beta/content/how_to_guides/other/sql.md b/docs-beta/content/how_to_guides/other/sql.md
index bc3c796e16..b9a6e6611f 100644
--- a/docs-beta/content/how_to_guides/other/sql.md
+++ b/docs-beta/content/how_to_guides/other/sql.md
@@ -1,5 +1,5 @@
 ---
-title: "Sql"
+title: "I. Querying using SQL"
 date: 2020-04-21T20:50:09-04:00
 draft: true
 ---
diff --git a/docs-beta/content/how_to_guides/other/tsdb.md b/docs-beta/content/how_to_guides/other/tsdb.md
index 7855911a27..ad4edc74a4 100644
--- a/docs-beta/content/how_to_guides/other/tsdb.md
+++ b/docs-beta/content/how_to_guides/other/tsdb.md
@@ -1,6 +1,126 @@
 ---
-title: "Tsdb"
+title: "II. Integrating with general TSDB"
 date: 2020-04-21T20:50:45-04:00
 draft: true
 ---
+Using M3DB as a general purpose time series database
+Overview
+M3 has native integrations that make it particularly easy to use as a metrics store for Prometheus and Graphite. M3DB can also be used as a general purpose distributed time series database by itself.
+Data Model
+IDs and Tags
+M3DB's data model allows multiple namespaces, each of which can be configured and tuned independently.
+Each namespace can also be configured with its own schema (see the "Schema Modeling" section below).
+Within a namespace, each time series is uniquely identified by an ID, which can be any valid string / byte array. In addition, tags can be attached to any series, which makes the series queryable using the inverted index.
+M3DB's inverted index supports term (exact match) and regular expression queries over all tag values, and individual tag queries can be arbitrarily combined using AND, OR, and NOT operators.
+For example, imagine an application that tracks a fleet of vehicles. One potential structure for the time series could be as follows:
+
+|                     | Timeseries 1  | Timeseries 2  | Timeseries 3 | Timeseries 4 |
+|---------------------|---------------|---------------|--------------|--------------|
+| Timeseries ID       | vehicle_id_1  | vehicle_id_2  | vehicle_id_3 | vehicle_id_4 |
+| "type" tag value    | sedan         | bike          | scooter      | scooter      |
+| "city" tag value    | san_francisco | san_francisco | new_york     | chicago      |
+| "version" tag value | 0_1_0         | 0_1_0         | 0_1_1        | 0_1_2        |
+
+This would allow users to issue queries that answer questions like:
+"What time series IDs exist for any vehicle type operating in San Francisco?"
+"What time series IDs exist for scooters that are NOT operating in Chicago?"
+"What time series IDs exist where the "version" tag matches the regular expression: 0_1_[12]"
+TODO(rartoul): Discuss the ability to perform limited amounts of aggregation queries here as well.
+TODO(rartoul): Discuss ID / tags mutability.
+Data Points
+Each time series in M3DB stores data as a stream of (timestamp, value) data point tuples. Timestamp resolution can be as granular as individual nanoseconds.
+The value portion of the tuple is a Protobuf message that matches the configured namespace schema, which requires that all values in the current time series must also match this schema. This limitation may be lifted in the future.
+Schema Modeling +Every M3DB namespace can be configured with a Protobuf-defined schema that every value in the time series must conform to +For example, continuing with the vehicle fleet tracking example introduced earlier, a schema might look as follows: +syntax = "proto3"; + +message VehicleLocation { + double latitude = 1; + double longitude = 2; + double fuel_percent = 3; + string status = 4; +} + +While M3DB strives to support the entire proto3 language spec, only the following features are currently supported: +Scalar values +Nested messages +Repeated fields +Map fields +Reserved fields +The following features are currently not supported: +Any fields +Oneof fields +Options of any type +Custom field types +Compression +While M3DB supports schemas that contain nested messages, repeated fields, and map fields, currently it can only effectively compress top level scalar fields. For example, M3DB can compress every field in the following schema: +syntax = "proto3"; + +message VehicleLocation { + double latitude = 1; + double longitude = 2; + double fuel_percent = 3; + string status = 4; +} + +however, it will not apply any form of compression to the attributes field in this schema: +syntax = "proto3"; + +message VehicleLocation { + double latitude = 1; + double longitude = 2; + double fuel_percent = 3; + string status = 4; + map attributes = 5; +} + +While the latter schema is valid, the attributes field will not be compressed; users should weigh the tradeoffs between more expressive schema and better compression for each use case. +For more details on the compression scheme and its limitations, review the documentation for M3DB's compressed Protobuf encoding. +Getting Started +M3DB setup +For more advanced setups, it's best to follow the guides on how to configure an M3DB cluster manually or using Kubernetes. However, this tutorial will walk you through configuring a single node setup locally for development. +First, run the following command to pull the latest M3DB image: +docker pull quay.io/m3db/m3dbnode:latest + +Next, run the following command to start the M3DB container: +docker run -p 7201:7201 -p 7203:7203 -p 9000:9000 -p 9001:9001 -p 9002:9002 -p 9003:9003 -p 9004:9004 -p 2379:2379 --name m3db -v $(pwd)/m3db_data:/var/lib/m3db -v $(pwd)/src/dbnode/config/m3dbnode-local-etcd-proto.yml:/etc/m3dbnode/m3dbnode.yml -v :/etc/m3dbnode/default_schema.proto quay.io/m3db/m3dbnode:latest + +Breaking that down: +All the -p flags expose the necessary ports. +The -v $(pwd)/m3db_data:/var/lib/m3db section creates a bind mount that enables M3DB to persist data between container restarts. +The -v :/etc/m3dbnode/m3dbnode.yml section mounts the specified configuration file in the container which allows configuration changes by restarting the container (rather than rebuilding it). This example file can be used as a good starting point. It configures the database to have the Protobuf feature enabled and expects one namespace with the name default and a Protobuf message name of VehicleLocation for the schema. You'll need to update that portion of the config if you intend to use a different schema than the example one used throughout this document. Note that hard-coding paths to the schema should only be done for local development and testing. For production use-cases, M3DB supports storing the current schema in etcd so that it can be update dynamically. TODO(rartoul): Document how to do that as well as what kind of schema changes are safe / backwards compatible. 
+The -v :/etc/m3dbnode/default_schema.proto section mounts the Protobuf file containing the schema in the container; similar to the configuration file, this allows the schema to be changed by restarting the container instead of rebuilding it. You can use this example schema as a starting point. It is also the same example schema that is used by the sample Go program discussed below in the "Clients" section. Also see the bullet point above about not hard-coding schema files in production.
+Once the M3DB container has started, issue the following curl command to create the default namespace:
+curl -X POST http://localhost:7201/api/v1/database/create -d '{
+  "type": "local",
+  "namespaceName": "default",
+  "retentionTime": "4h"
+}'
+
+Note that the retentionTime is set artificially low to conserve resources.
+After a few moments, the M3DB container should finish bootstrapping. At this point it should be ready to serve write and read queries.
+Clients
+Note: M3DB only has a Go client; this is unlikely to change in the future due to the fact that the client is "fat" and contains a substantial amount of logic that would be difficult to port to other languages.
+Users interested in interacting with M3DB directly from Go applications can reference this runnable example to get an understanding of how to interact with M3DB in Go. Note that the example above uses the same default namespace and VehicleLocation schema used throughout this document so it can be run directly against an M3DB docker container setup using the "M3DB setup" instructions above.
+M3DB will eventually support other languages by exposing an M3Coordinator endpoint which will allow users to write/read from M3DB directly using GRPC/JSON.
+
diff --git a/docs-beta/content/how_to_guides/other/upgrading.md b/docs-beta/content/how_to_guides/other/upgrading.md
index 65e31bd309..569732cae6 100644
--- a/docs-beta/content/how_to_guides/other/upgrading.md
+++ b/docs-beta/content/how_to_guides/other/upgrading.md
@@ -1,6 +1,68 @@
 ---
-title: "Upgrading"
+title: "III. Upgrading M3"
 date: 2020-04-21T20:50:39-04:00
 draft: true
 ---
+Upgrading M3
+Overview
+This guide explains how to upgrade M3 from one version to another (e.g. from 0.14.0 to 0.15.0). This includes upgrading:
+m3dbnode
+m3coordinator
+m3query
+m3aggregator
+m3dbnode
+Graphs to monitor
+While upgrading M3DB nodes, it's important to monitor the status of bootstrapping the individual nodes. This can be monitored using the M3DB Node Details graph. Typically, the Bootstrapped graph under Background Tasks and the graphs under CPU and Memory Utilization give a good understanding of how well bootstrapping is going.
+Non-Kubernetes
+It is very important that for each replica set, only one node gets upgraded at a time. However, multiple nodes can be upgraded across replica sets.
+1) Download the new binary (Linux example below).
+wget "https://github.com/m3db/m3/releases/download/v$VERSION/m3_${VERSION}_linux_amd64.tar.gz" && tar xvzf m3_${VERSION}_linux_amd64.tar.gz && rm m3_${VERSION}_linux_amd64.tar.gz
+
+2) Stop and upgrade one M3DB node at a time per replica set using the systemd unit.
+# stop m3dbnode
+sudo systemctl stop m3dbnode
+
+# start m3dbnode with the new binary (which should be placed in the path specified in the systemd unit)
+sudo systemctl start m3dbnode
+
+Note: If unable to stop m3dbnode using systemctl, use pkill instead.
+# stop m3dbnode
+pkill m3dbnode
+
+# start m3dbnode with the new binary
+./m3_${VERSION}_linux_amd64/m3dbnode -f
+
+3) Confirm m3dbnode has finished bootstrapping.
+20:10:12.911218[I] updating database namespaces [{adds [default]} {updates []} {removals []}]
+20:10:13.462798[I] node tchannelthrift: listening on 0.0.0.0:9000
+20:10:13.463107[I] cluster tchannelthrift: listening on 0.0.0.0:9001
+20:10:13.747173[I] node httpjson: listening on 0.0.0.0:9002
+20:10:13.747506[I] cluster httpjson: listening on 0.0.0.0:9003
+20:10:13.747763[I] bootstrapping shards for range starting ...
+...
+20:10:13.757834[I] bootstrap finished [{namespace metrics} {duration 10.1261ms}]
+20:10:13.758001[I] bootstrapped
+20:10:14.764771[I] successfully updated topology to 3 hosts
+
+4) Repeat steps 2 and 3 until all nodes have been upgraded.
+Kubernetes
+If running M3DB on Kubernetes, upgrade by completing the following steps.
+Identify the version of m3dbnode to upgrade to on Quay.
+Replace the Docker image in the StatefulSet manifest (or m3db-operator manifest) with the new version of m3dbnode.
+spec:
+  image: quay.io/m3db/m3dbnode:$VERSION
+
+Once updated, apply the updated manifest and a rolling restart will be performed.
+kubectl apply -f
+
+Downgrading
+The upgrading steps above can also be used to downgrade M3DB. However, it is important to refer to the release notes to make sure that versions are backwards compatible.
+m3coordinator
+m3coordinator can be upgraded using similar steps as m3dbnode; however, the images can be found here instead.
+m3query
+m3query can be upgraded using similar steps as m3dbnode; however, the images can be found here instead.
+m3aggregator
+m3aggregator can be upgraded using similar steps as m3dbnode; however, the images can be found here instead.
+
+Integrations
diff --git a/docs-beta/content/how_to_guides/prometheus.md b/docs-beta/content/how_to_guides/prometheus.md
index 16b8421e65..b65777265c 100644
--- a/docs-beta/content/how_to_guides/prometheus.md
+++ b/docs-beta/content/how_to_guides/prometheus.md
@@ -1,6 +1,115 @@
 ---
-title: "Prometheus"
+title: "Integrating with Prometheus"
 date: 2020-04-21T20:48:58-04:00
 draft: true
 ---
+Prometheus
+As mentioned in our integrations guide, M3DB can be used as a remote read/write endpoint for Prometheus.
+If you run Prometheus on your Kubernetes cluster you can easily point it at M3DB in your Prometheus server config:
+remote_read:
+  - url: "http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/read"
+    # To test reading even when local Prometheus has the data
+    read_recent: true
+
+remote_write:
+  - url: "http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/write"
+    # To differentiate between local and remote storage we will add a storage label
+    write_relabel_configs:
+      - target_label: metrics_storage
+        replacement: m3db_remote
+
+
+Prometheus
+This document is a getting started guide to integrating M3DB with Prometheus.
+M3 Coordinator configuration
+To write to a remote M3DB cluster, the simplest configuration is to run m3coordinator as a sidecar alongside Prometheus.
+Start by downloading the config template. Update the namespaces and the client section for a new cluster to match your cluster's configuration.
+You'll need to specify the static IPs or hostnames of your M3DB seed nodes, and the name and retention values of the namespace you set up. You can leave the namespace storage metrics type as unaggregated since it's required by default to have a cluster that receives all Prometheus metrics unaggregated.
In the future you might also want to aggregate and downsample metrics for longer retention, and you can come back and update the config once you've setup those clusters. You can read more about our aggregation functionality here. +It should look something like: +listenAddress: + type: "config" + value: "0.0.0.0:7201" + +logging: + level: info + +metrics: + scope: + prefix: "coordinator" + prometheus: + handlerPath: /metrics + listenAddress: 0.0.0.0:7203 # until https://github.com/m3db/m3/issues/682 is resolved + sanitization: prometheus + samplingRate: 1.0 + extended: none + +tagOptions: + idScheme: quoted + +clusters: + - namespaces: +# We created a namespace called "default" and had set it to retention "48h". + - namespace: default + retention: 48h + type: unaggregated + client: + config: + service: + env: default_env + zone: embedded + service: m3db + cacheDir: /var/lib/m3kv + etcdClusters: + - zone: embedded + endpoints: +# We have five M3DB nodes but only three are seed nodes, they are listed here. + - M3DB_NODE_01_STATIC_IP_ADDRESS:2379 + - M3DB_NODE_02_STATIC_IP_ADDRESS:2379 + - M3DB_NODE_03_STATIC_IP_ADDRESS:2379 + writeConsistencyLevel: majority + readConsistencyLevel: unstrict_majority + writeTimeout: 10s + fetchTimeout: 15s + connectTimeout: 20s + writeRetry: + initialBackoff: 500ms + backoffFactor: 3 + maxRetries: 2 + jitter: true + fetchRetry: + initialBackoff: 500ms + backoffFactor: 2 + maxRetries: 3 + jitter: true + backgroundHealthCheckFailLimit: 4 + backgroundHealthCheckFailThrottleFactor: 0.5 + +Now start the process up: +m3coordinator -f + +Or, use the docker container: +docker pull quay.io/m3db/m3coordinator:latest +docker run -p 7201:7201 --name m3coordinator -v :/etc/m3coordinator/m3coordinator.yml quay.io/m3db/m3coordinator:latest + +Prometheus configuration +Add to your Prometheus configuration the m3coordinator sidecar remote read/write endpoints, something like: +remote_read: + - url: "http://localhost:7201/api/v1/prom/remote/read" + # To test reading even when local Prometheus has the data + read_recent: true +remote_write: + - url: "http://localhost:7201/api/v1/prom/remote/write" + +Also, we recommend adding M3DB and M3Coordinator/M3Query to your list of jobs under scrape_configs so that you can monitor them using Prometheus. With this scraping setup, you can also use our pre-configured M3DB Grafana dashboard. +- job_name: 'm3db' + static_configs: + - targets: [':7203', ':7203', ':7203'] +- job_name: 'm3coordinator' + static_configs: + - targets: [':7203'] + +NOTE: If you are running M3DB with embedded M3Coordinator, you should only have one job. We recommend just calling this job m3. For example: +- job_name: 'm3' + static_configs: + - targets: [':7203'] diff --git a/docs-beta/content/intro_m3/_index.md b/docs-beta/content/intro_m3/_index.md new file mode 100644 index 0000000000..f4b2c25b00 --- /dev/null +++ b/docs-beta/content/intro_m3/_index.md @@ -0,0 +1,37 @@ ++++ +title = "Introduction to M3" +date = 2020-04-22T15:05:49-04:00 +weight = 1 +chapter = true +pre = "1. " ++++ + +### Introduction to M3 + +### About +After using open-source metrics solutions and finding issues with them at scale, such as reliability, cost, and operational complexity, M3 was created from the ground up to provide Uber with a native, distributed time series database, a highly-dynamic and performant aggregation service, a query engine, and other supporting infrastructure. 
+ +### Key Features +M3 has several features, provided as discrete components, which make it an ideal platform for time series data at scale: + +* A distributed time series database, M3DB, that provides scalable storage for time series data and a reverse index. +* A sidecar process, M3Coordinator, that allows M3DB to act as the long-term storage for Prometheus. +* A distributed query engine, M3Query, with native support for PromQL and Graphite (M3QL coming soon). +* An aggregation tier, M3Aggregator, that runs as a dedicated metrics aggregator/downsampler allowing metrics to be stored at various retentions at different resolutions. + + +## Components +### M3 Coordinator +M3 Coordinator is a service that coordinates reads and writes between upstream systems, such as Prometheus, and M3DB. It is a bridge that users can deploy to access the benefits of M3DB such as long term storage and multi-DC setup with other monitoring systems, such as Prometheus. See this presentation for more on long term storage in Prometheus. + +### M3DB +M3DB is a distributed time series database that provides scalable storage and a reverse index of time series. It is optimized as a cost effective and reliable realtime and long term retention metrics store and index. For more details, see the M3DB documentation. + +### M3 Query +M3 Query is a service that houses a distributed query engine for querying both realtime and historical metrics, supporting several different query languages. It is designed to support both low latency realtime queries and queries that can take longer to execute, aggregating over much larger datasets, for analytical use cases. For more details, see the query engine documentation. + +### M3 Aggregator +M3 Aggregator is a service that runs as a dedicated metrics aggregator and provides stream based downsampling, based on dynamic rules stored in etcd. It uses leader election and aggregation window tracking, leveraging etcd to manage this state, to reliably emit at-least-once aggregations for downsampled metrics to long term storage. This provides cost effective and reliable downsampling & roll up of metrics. These features also reside in the M3 Coordinator, however the dedicated aggregator is sharded and replicated, whereas the M3 Coordinator is not and requires care to deploy and run in a highly available way. There is work remaining to make the aggregator more accessible to users without requiring them to write their own compatible producer and consumer. + +## Motivation +We decided to open source the M3 platform as a scalable remote storage backend for Prometheus and Graphite so that others may attempt to reuse our work and avoid building yet another scalable metrics platform. As documentation for Prometheus states, it is limited by single nodes in its scalability and durability. The M3 platform aims to provide a turnkey, scalable, and configurable multi-tenant store for Prometheus, Graphite and other standard metrics schemas. diff --git a/docs-beta/content/intro_m3/overview.md b/docs-beta/content/intro_m3/overview.md deleted file mode 100644 index e0c54cf7f6..0000000000 --- a/docs-beta/content/intro_m3/overview.md +++ /dev/null @@ -1,13 +0,0 @@ -+++ -title = "Overview" -date = 2020-04-21T20:44:20-04:00 -weight = 5 -chapter = true -pre = "X. " -+++ - -### Chapter X - -# Some Chapter title - -Lorem Ipsum. 
\ No newline at end of file diff --git a/docs-beta/content/media/_index.md b/docs-beta/content/media/_index.md index f38b50d3c8..36e2e2015a 100644 --- a/docs-beta/content/media/_index.md +++ b/docs-beta/content/media/_index.md @@ -1,12 +1,12 @@ +++ title = "Media" date = 2020-04-01T20:23:23-04:00 -weight = 11 +weight = 10 chapter = true -pre = "11. " +pre = "10. " +++ -### Section 11 +### Section 10 # Media Coverage diff --git a/docs-beta/content/operational_guides/_index.md b/docs-beta/content/operational_guides/_index.md index a0c7407b8a..93d2284690 100644 --- a/docs-beta/content/operational_guides/_index.md +++ b/docs-beta/content/operational_guides/_index.md @@ -1,9 +1,9 @@ +++ -title = "Operational_guides" +title = "Operational Guides" date = 2020-04-21T20:57:48-04:00 -weight = 5 +weight = 6 chapter = true -pre = "X. " +pre = "6. " +++ ### Chapter X diff --git a/docs-beta/content/operational_guides/managing_aggregator.md b/docs-beta/content/operational_guides/managing_aggregator.md index dd8ac89117..85985590b4 100644 --- a/docs-beta/content/operational_guides/managing_aggregator.md +++ b/docs-beta/content/operational_guides/managing_aggregator.md @@ -1,5 +1,5 @@ --- -title: "Managing_aggregator" +title: "Managing M3 Aggregator" date: 2020-04-21T20:59:23-04:00 draft: true --- diff --git a/docs-beta/content/operational_guides/managing_m3db.md b/docs-beta/content/operational_guides/managing_m3db.md deleted file mode 100644 index 8213ceadbd..0000000000 --- a/docs-beta/content/operational_guides/managing_m3db.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: "Managing_m3db" -date: 2020-04-21T20:58:30-04:00 -draft: true ---- - diff --git a/docs-beta/content/operational_guides/managing_m3db/_index.md b/docs-beta/content/operational_guides/managing_m3db/_index.md index 3a46e6fd95..c3ac24dc96 100644 --- a/docs-beta/content/operational_guides/managing_m3db/_index.md +++ b/docs-beta/content/operational_guides/managing_m3db/_index.md @@ -1,6 +1,103 @@ --- -title: "Managing_m3db" +title: "Managing M3DB" date: 2020-04-21T21:00:08-04:00 draft: true --- +M3DB, a distributed time series database +About +M3DB, inspired by Gorilla and Cassandra, is a distributed time series database released as open source by Uber Technologies. It can be used for storing realtime metrics at long retention. +Here are some attributes of the project: +Distributed time series storage, single nodes use a WAL commit log and persists time windows per shard independently +Cluster management built on top of etcd +Built-in synchronous replication with configurable durability and read consistency (one, majority, all, etc) +M3TSZ float64 compression inspired by Gorilla TSZ compression, configurable as lossless or lossy +Arbitrary time precision configurable from seconds to nanoseconds precision, able to switch precision with any write +Configurable out of order writes, currently limited to the size of the configured time window's block size +Current Limitations +Due to the nature of the requirements for the project, which are primarily to reduce the cost of ingesting and storing billions of timeseries and providing fast scalable reads, there are a few limitations currently that make M3DB not suitable for use as a general purpose time series database. +The project has aimed to avoid compactions when at all possible, currently the only compactions M3DB performs are in-memory for the mutable compressed time series window (default configured at 2 hours). 
As such out of order writes are limited to the size of a single compressed time series window. Consequently backfilling large amounts of data is not currently possible. +The project has also optimized the storage and retrieval of float64 values, as such there is no way to use it as a general time series database of arbitrary data structures just yet. + +Architecture +Overview +M3DB is written entirely in Go and does not have any required dependencies. For larger deployments, one may use an etcd cluster to manage M3DB cluster membership and topology definition. +High Level Goals +Some of the high level goals for the project are defined as: +Monitoring support: M3DB was primarily developed for collecting a high volume of monitoring time series data, distributing the storage in a horizontally scalable manner and most efficiently leveraging the hardware. As such time series that are not read frequently are not kept in memory. +Highly configurable: Provide a high level of configuration to support a wide set of use cases and runtime environments. +Variable durability: Providing variable durability guarantees for the write and read side of storing time series data enables a wider variety of applications to use M3DB. This is why replication is primarily synchronous and is provided with configurable consistency levels, to enable consistent writes and reads. It must be possible to use M3DB with strong guarantees that data was replicated to a quorum of nodes and that the data was durable if desired. +Storage Engine Overview +M3DB is a time series database that was primarily designed to be horizontally scalable and able to handle high data throughput. +Time Series Compression +One of M3DB's biggest strengths as a time series database (as opposed to using a more general-purpose horizontally scalable, distributed database like Cassandra) is its ability to compress time series data resulting in huge memory and disk savings. There are two compression algorithms used in M3DB: M3TSZ and protobuf encoding. +M3TSZ +M3TSZ is used when values are floats. A variant of the streaming time series compression algorithm described in Facebook's Gorilla paper, it achieves a high compression ratio. The compression ratio will vary depending on the workload and configuration, but we found that we were able to achieve a compression ratio of 1.45 bytes/datapoint with Uber's production workloads. This was a 40% improvement over standard TSZ, which only gave us a compression ratio of 2.42 bytes/datapoint under the same conditions. +Protobuf Encoding +For more complex value types, M3DB also supports generic Protobuf messages with a few exceptions. The algorithm takes on a hybrid approach and uses different compression schemes depending on the field types within the Protobuf message. +Details on the encoding, marshaling and unmarshaling methods can be read here. + +The in-memory portion of M3DB is implemented via a hierarchy of objects: +A database of which there is only one per M3DB process. The database owns multiple namespaces. +A namespace is similar to a table in other databases. Each namespace has a unique name and a set of configuration options, such as data retention and block size (which we will discuss in more detail later). A namespace owns multiple shards. +A shard is effectively the same as a "virtual shard" in Cassandra in that it provides an arbitrary distribution of time series data via a simple hash of the series ID. A shard owns multiple series. +A series represents a sequence of time series datapoints. 
For example, the CPU utilization for a host could be represented as a series with the ID "host1.system.cpu.utilization" and a vector of (TIMESTAMP, CPU_LEVEL) tuples. Visualizing this example in a graph, there would be a single line with time on the x-axis and CPU utilization on the y-axis. A series owns a buffer and any cached blocks.
+The buffer is where all data that has yet to be written to disk gets stored in memory. This includes both new writes to M3DB and data obtained through bootstrapping. More details on the buffer are provided below. Upon flushing, the buffer creates a block of its data to be persisted to disk.
+A block represents a stream of compressed time series data for a pre-configured block size, for example, a block could hold data for 6-8PM (block size of two hours). A block can arrive directly into the series only as a result of getting cached after a read request. Since blocks are stored in a compressed format, individual datapoints cannot be read from them. In other words, in order to read a single datapoint, the entire block up to that datapoint needs to be decompressed beforehand.
+Persistent storage
+While in-memory databases can be useful (and M3DB supports operating in a memory-only mode), some form of persistence is required for durability. In other words, without a persistence strategy, it would be impossible for M3DB to restart (or recover from a crash) without losing all of its data.
+In addition, with large volumes of data, it becomes prohibitively expensive to keep all of the data in memory. This is especially true for monitoring workloads which often follow a "write-once, read-never" pattern where less than a few percent of all the data that's stored is ever read. With that type of workload, it's wasteful to keep all of that data in memory when it could be persisted on disk and retrieved when required.
+M3DB takes a two-pronged approach to persistent storage that involves combining a commit log for disaster recovery with periodic flushing (writing fileset files to disk) for efficient retrieval:
+All writes are persisted to a commit log (the commit log can be configured to fsync every write, or optionally batch writes together which is much faster but leaves open the possibility of small amounts of data loss in the case of a catastrophic failure). The commit log is completely uncompressed and exists only to recover unflushed data in the case of a database shutdown (intentional or not) and is never used to satisfy a read request.
+Periodically (based on the configured block size), all data in the buffer is flushed to disk as immutable fileset files. These files are highly compressed and can be indexed into via their complementary index files. Check out the flushing section to learn more about the background flushing process.
+The block size parameter is the most important variable that needs to be tuned for a particular workload. A small block size will mean more frequent flushing and a smaller memory footprint for the data that is being actively compressed, but it will also reduce the compression ratio and data will take up more space on disk.
+If the database is stopped for any reason in between flushes, then when the node is started back up those writes will be recovered by reading the commit log or streaming in the data from a peer responsible for the same shard (if the replication factor is larger than one).
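+To make the block size tradeoff described above concrete (the numbers here are purely illustrative, not a recommendation): with a 48 hour retention and a two hour block size, each shard retains on the order of 24 fileset volumes and buffers up to two hours of writes in memory before flushing; shrinking the block size to 30 minutes cuts the in-memory window to 30 minutes but roughly quadruples the number of fileset volumes (around 96 per shard) and shortens each compressed stream, which lowers the compression ratio and increases on-disk size.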
+While the fileset files are designed to support efficient data retrieval via the series ID, there is still a heavy cost associated with any query that has to retrieve data from disk because going to disk is always much slower than accessing main memory. To compensate for that, M3DB supports various caching policies which can significantly improve the performance of reads by caching data in memory. + + +Storage +Overview +The primary unit of long-term storage for M3DB are fileset files which store compressed streams of time series values, one per shard block time window size. +They are flushed to disk after a block time window becomes unreachable, that is the end of the time window for which that block can no longer be written to. If a process is killed before it has a chance to flush the data for the current time window to disk it must be restored from the commit log (or a peer that is responsible for the same shard if replication factor is larger than 1.) +FileSets +A fileset has the following files: +Info file: Stores the block time window start and size and other important metadata about the fileset volume. +Summaries file: Stores a subset of the index file for purposes of keeping the contents in memory and jumping to section of the index file that within a few pages of linear scanning can find the series that is being looked up. +Index file: Stores the series metadata, including tags if indexing is enabled, and location of compressed stream in the data file for retrieval. +Data file: Stores the series compressed data streams. +Bloom filter file: Stores a bloom filter bitset of all series contained in this fileset for quick knowledge of whether to attempt retrieving a series for this fileset volume. +Digests file: Stores the digest checksums of the info file, summaries file, index file, data file and bloom filter file in the fileset volume for integrity verification. +Checkpoint file: Stores a digest of the digests file and written at the succesful completion of a fileset volume being persisted, allows for quickly checking if a volume was completed. + ┌─────────────────────┐ +┌─────────────────────┐ ┌─────────────────────┐ │ Index File │ +│ Info File │ │ Summaries File │ │ (sorted by ID) │ +├─────────────────────┤ │ (sorted by ID) │ ├─────────────────────┤ +│- Block Start │ ├─────────────────────┤ ┌─>│- Idx │ +│- Block Size │ │- Idx │ │ │- ID │ +│- Entries (Num) │ │- ID │ │ │- Size │ +│- Major Version │ │- Index Entry Offset ├──┘ │- Checksum │ +│- Summaries (Num) │ └─────────────────────┘ │- Data Entry Offset ├──┐ +│- BloomFilter (K/M) │ │- Encoded Tags | | +│- Snapshot Time │ └─────────────────────┘ │ +│- Type (Flush/Snap) │ │ +└─────────────────────┘ │ + │ + ┌─────────────────────┐ ┌───────────────────────────┘ +┌─────────────────────┐ │ Bloom Filter File │ │ +│ Digests File │ ├─────────────────────┤ │ ┌─────────────────────┐ +├─────────────────────┤ │- Bitset │ │ │ Data File │ +│- Info file digest │ └─────────────────────┘ │ ├─────────────────────┤ +│- Summaries digest │ │ │List of: │ +│- Index digest │ └─>│ - Marker (16 bytes)│ +│- Data digest │ │ - ID │ +│- Bloom filter digest│ │ - Data (size bytes)│ +└─────────────────────┘ └─────────────────────┘ + +┌─────────────────────┐ +│ Checkpoint File │ +├─────────────────────┤ +│- Digests digest │ +└─────────────────────┘ + +In the diagram above you can see that the data file stores compressed blocks for a given shard / block start combination. 
The index file (which is sorted by ID and thus can be binary searched or scanned) can be used to find the offset of a specific ID. +FileSet files will be kept for every shard / block start combination that is within the retention period. Once the files fall out of the period defined in the configurable namespace retention period they will be deleted. diff --git a/docs-beta/content/operational_guides/managing_m3db/etcd.md b/docs-beta/content/operational_guides/managing_m3db/etcd.md index 5b5de4bbed..33a81cf42a 100644 --- a/docs-beta/content/operational_guides/managing_m3db/etcd.md +++ b/docs-beta/content/operational_guides/managing_m3db/etcd.md @@ -4,3 +4,65 @@ date: 2020-04-21T20:58:58-04:00 draft: true --- +etcd +The configuration file linked above uses an embedded etcd cluster, which is fine for development purposes. However, if you wish to use this in production, you will want an external etcd cluster. + + +etcd +The M3 stack leverages etcd as a distributed key-value storage to: +Update cluster configuration in realtime +Manage placements for our distributed / sharded tiers like M3DB and M3Aggregator +Perform leader-election in M3Aggregator +and much more! +Overview +M3DB ships with support for running embedded etcd (called seed nodes), and while this is convenient for testing and development, we don't recommend running with this setup in production. +Both M3 and etcd are complex distributed systems, and trying to operate both within the same binary is challenging and dangerous for production workloads. +Instead, we recommend running an external etcd cluster that is isolated from the M3 stack so that performing operations like node adds, removes, and replaces are easier. +While M3 relies on etcd to provide strong consistency, the perations we use it for are all low-throughput so you should be able to operate a very low maintenance etcd cluster. A 3-node setup for high availability should be more than sufficient for most workloads. +Configuring an External etcd Cluster +M3DB +Most of our documentation demonstrates how to run M3DB with embedded etcd nodes. Once you're ready to switch to an external etcd cluster, all you need to do is modify the M3DB config to remove the seedNodes field entirely and then change the endpoints under etcdClusters to point to your external etcd nodes instead of the M3DB seed nodes. +For example this portion of the config +config: + service: + env: default_env + zone: embedded + service: m3db + cacheDir: /var/lib/m3kv + etcdClusters: + - zone: embedded + endpoints: + - http://m3db_seed1:2379 + - http://m3db_seed2:2379 + - http://m3db_seed3:2379 + seedNodes: + initialCluster: + - hostID: m3db_seed1 + endpoint: http://m3db_seed1:2380 + - hostID: m3db_seed2 + endpoint: http://m3db_seed2:2380 + - hostID: m3db_seed3 + endpoint: http://m3db_seed3:2380 + +would become +config: + service: + env: default_env + zone: embedded + service: m3db + cacheDir: /var/lib/m3kv + etcdClusters: + - zone: embedded + endpoints: + - http://external_etcd1:2379 + - http://external_etcd2:2379 + - http://external_etcd3:2379 + +Note: M3DB placements and namespaces are stored in etcd so if you want to switch to an external etcd cluster you'll need to recreate all your placements and namespaces. You can do this manually or use etcdctl's Mirror Maker functionality. +M3Coordinator +M3Coordinator does not run embedded etcd, so configuring it to use an external etcd cluster is simple. 
Just replace the endpoints under etcdClusters in the YAML config to point to your external etcd nodes instead of the M3DB seed nodes. See the M3DB example above for a detailed before/after comparison of the YAML config. +etcd Operations +Embedded etcd +If you're running M3DB seed nodes with embedded etcd (which we do not recommend for production workloads) and need to perform a node add/replace/remove then follow our placement configuration guide and pay special attention to follow the special instructions for seed nodes. +External etcd +Just follow the instructions in the etcd docs. diff --git a/docs-beta/content/operational_guides/managing_m3db/m3db_node_mgmt.md b/docs-beta/content/operational_guides/managing_m3db/m3db_node_mgmt.md index f85db80647..d53b97709e 100644 --- a/docs-beta/content/operational_guides/managing_m3db/m3db_node_mgmt.md +++ b/docs-beta/content/operational_guides/managing_m3db/m3db_node_mgmt.md @@ -1,6 +1,131 @@ --- -title: "M3db_node_mgmt" +title: "M3DB node management" date: 2020-04-21T20:58:41-04:00 draft: true --- +Write Path +We now have enough context of M3DB's architecture to discuss the lifecycle of a write. A write begins when an M3DB client calls the writeBatchRaw endpoint on M3DB's embedded thrift server. The write itself will contain the following information: +The namespace +The series ID (byte blob) +The timestamp +The value itself +M3DB will consult the database object to check if the namespace exists, and if it does, then it will hash the series ID to determine which shard it belongs to. If the node receiving the write owns that shard, then it will lookup the series in the shard object. If the series exists, then an encoder in the buffer will encode the datapoint into the compressed stream. If the encoder doesn't exist (no writes for this series have occurred yet as part of this block) then a new encoder will be allocated and it will begin a compressed M3TSZ stream with that datapoint. There is also some additional logic for handling multiple encoders and filesets which is discussed in the buffer section. +At the same time, the write will be appended to the commit log, which is periodically compacted via a snapshot process. Details of this is outlined in the commit log page. +Note: Regardless of the success or failure of the write in a single node, the client will return a success or failure to the caller for the write based on the configured consistency level. +Read Path +A read begins when an M3DB client calls the FetchBatchResult or FetchBlocksRawResult endpoint on M3DB's embedded thrift server. The read request will contain the following information: +The namespace +The series ID (byte blob) +The period of time being requested (start and end) +M3DB will consult the database object to check if the namespace exists, and if it does, it will hash the series ID to determine which shard it belongs to. If the node receiving the read owns that shard, then M3DB needs to determine two things: +Whether the series exists and if it does, +Whether the data exists in the buffer, cached in-memory, on disk, or some combination of all three. +Determining whether the series exists is simple. M3DB looks up the series in the shard object. If it exists, then the series exists. If it doesn't, then M3DB consults in-memory bloom filters(s) for all shard/block start combinations(s) that overlap the query range to determine if the series exists on disk. 
+If the series exists, then for every block that the request spans, M3DB needs to consolidate data from the buffer, in-memory cache, and fileset files (disk). +Let's imagine a read for a given series that requests the last 6 hours worth of data, and an M3DB namespace that is configured with a block size of 2 hours, i.e. we need to find 3 different blocks. +If the current time is 8PM, then the location of the requested blocks might be as follows: +[2PM - 4PM (fileset file)] - Flushed block that isn't cached +[4PM - 6PM (in-memory cache)] - Flushed block that is cached +[4PM - 6PM (cold write in active buffer)] - Cold write that hasn't been flushed yet +[6PM - 8PM (active buffer)] - Hasn't been flushed yet + +Then M3DB will need to consolidate: +The not-yet-sealed block from the buffer (located inside an internal lookup in the Series object) [6PM - 8PM] +The in-memory cached block (also located inside an internal lookup in the Series object). Since there are also cold writes in this block, the cold writes will be consolidated in memory with data found in the cached block before returning. [4PM - 6PM] +The block from disk (the block will be retrieved from disk and will then be cached according to the current caching policy) [2PM - 4PM] +Retrieving blocks from the buffer and in-memory cache is simple, the data is already present in memory and easily accessible via hashmaps keyed by series ID. Retrieving a block from disk is more complicated. The flow for retrieving a block from disk is as follows: +Consult the in-memory bloom filter to determine if it's possible the series exists on disk. +If the bloom filter returns negative, we are sure that the series isn't there, so return that result. If the bloom filter returns positive, then binary search the in-memory index summaries to find the nearest index entry that is before the series ID that we're searching for. Review the index_lookup.go file for implementation details. +Jump to the offset in the index file that we obtained from the binary search in the previous step, and begin scanning forward until we identify the index entry for the series ID we're looking for or we get far enough in the index file that it becomes clear that the ID we're looking for doesn't exist (this is possible because the index file is sorted by ID) +Jump to the offset in the data file that we obtained from scanning the index file in the previous step, and begin streaming data. +Once M3DB has retrieved the three blocks from their respective locations in memory / on-disk, it will transmit all of the data back to the client. Whether or not the client returns a success to the caller for the read is dependent on the configured consistency level. +Note: Since M3DB nodes return compressed blocks (the M3DB client decompresses them), it's not possible to return "partial results" for a given block. If any portion of a read request spans a given block, then that block in its entirety must be transmitted back to the client. In practice, this ends up being not much of an issue because of the high compression ratio that M3DB is able to achieve. + +Buffer +Each series object contains a buffer, which is in charge of handling all data that has yet to be flushed - new writes and bootstrapped data. To accomplish this, it keeps mutable "buckets" of encoders (for new writes) and immutable blocks (for bootstrapped data). M3TSZ, the database's encoding scheme, is designed for compressing time series data in which each datapoint has a timestamp that is larger than the last encoded datapoint. 
For metrics workloads this works very well because every subsequent datapoint is almost always after the previous one. However, out of order writes will occasionally be received, for example due to clock skew. When this happens, M3DB will allocate a new encoder for the out of order datapoints. These encoders are contained in a bucket along with any blocks that got bootstrapped. +Upon a flush (discussed further below), all data within a bucket gets merged and its version gets incremented - the specific version it gets set to depends on the number of times this block has previously been flushed. This bucket versioning allows the buffer to know which data has been flushed so that subsequent flushes will not try to flush it again. It also indicates to the clean up process (also discussed below) that that data can be evicted. +Given this complex, concurrent logic, this has been modeled in TLA. + ┌─────────────────────────┐ + │ Buffer │ + ├─────────────────────────┤ + │ │ + │ ┌─────────────────┐ │ + │ │ 2-4PM buckets │ │ + │ └─────────────────┘ │ + │ │ + │ ┌─────────────────┐ │ + ┌────│───│ 4-6PM buckets │ | + │ │ └─────────────────┘ │ + │ │ │ + │ │ ... │ + │ └─────────────────────────┘ + │ + │ + v After flush: + ┌─────────────────────┐ ┌─────────────────────┐ + │ 4-6PM buckets │ │ 4-6PM buckets │ + ├─────────────────────┤ ├─────────────────────┤ + │ │ │ │ + │ ┌─────────────┐ │ │ ┌─────────────┐ │ + │ │ Bucket v0 │<--│--writes │ │ Bucket v3 │ │ + │ └─────────────┘ │ │ └─────────────┘ │ + │ │ │ │ + │ │ │ │ + │ │ │ │ + │ │ │ │ + │ │ │ │ + └─────────────────────┘ └─────────────────────┘ + + + More writes after flush: After clean up: + ┌─────────────────────┐ ┌─────────────────────┐ + │ 4-6PM buckets │ │ 4-6PM buckets │ + ├─────────────────────┤ ├─────────────────────┤ + │ │ │ │ + │ ┌─────────────┐ │ │ ┌─────────────┐ │ + │ │ Bucket v3 │ │ │ │ Bucket v0 │<--│--writes + │ └─────────────┘ │ │ └─────────────┘ │ + │ │ │ │ + │ ┌─────────────┐ │ │ │ + │ │ Bucket v0 │<--│--writes │ │ + │ └─────────────┘ │ │ │ + │ │ │ │ + └─────────────────────┘ └─────────────────────┘ + +Background processes +M3DB has a variety of processes that run in the background during normal operation. +Flushing +As discussed in the architecture section, writes are actively buffered / compressed in memory and the commit log is continuously being written to, but eventually data needs to be flushed to disk in the form of fileset files to facilitate efficient storage and retrieval. +This is where the configurable "block size" comes into play. The block size is simply a duration of time that dictates how long new writes will be compressed (in a streaming manner) in memory before being flushed to disk. Let's use a block size of two hours as an example. +If the block size is set to two hours, then all writes for all series for a given shard will be buffered in memory for two hours at a time. At the end of the two hour period all of the fileset files will be generated, written to disk, and then the in-memory objects can be released and replaced with new ones for the new block. The old objects will be removed from memory in the subsequent tick. +If a flush happens for a namespace/shard/series/block for which there is already a fileset, in-memory data will get merged with data on disk from the fileset. The resultant merged data will then be flushed as a separate fileset. 
+Ticking
+The ticking process runs continuously in the background and is responsible for a variety of tasks:
+Merging all encoders for a given series / block start combination
+Removing expired / flushed series and blocks from memory
+Clean up of expired data (fileset/commit log) from the filesystem
+Merging all encoders
+If there are multiple encoders for a block, they need to be merged before flushing the data to disk. To prevent huge memory spikes during the flushing process we continuously merge out of order encoders in the background.
+Removing expired / flushed series and blocks from memory
+Depending on the configured caching policy, the in-memory object layout can end up with references to series or data blocks that are expired (have fallen out of the retention period) or that no longer need to be in memory (due to the data being flushed to disk or no longer needing to be cached). The background tick will identify these structures and release them from memory.
+Clean up of expired data
+Fileset files can become no longer necessary for two reasons:
+The fileset files are for a block that has fallen out of retention
+A flush occurred for a block that already has a fileset file. The new fileset will be a superset of the existing fileset plus any new data for that block; hence, the existing fileset is no longer required
+During the clean up process, these fileset files will get deleted.
+Caveats / Limitations
+Currently M3DB does not support deletes.
+M3DB does not support storing data with an indefinite retention period; every namespace in M3DB is required to have a retention policy which specifies how long data in that namespace will be retained for. While there is no upper bound on that value, it's still required and generally speaking M3DB is optimized for workloads with a well-defined TTL.
+M3DB does not support either background data repair or Cassandra-style read repairs. Future versions of M3DB will support automatic repairs of data as an ongoing background process.
+M3DB does not support writing far into the future. Support for this will be added in the future.
+
+
+Cluster operations
+Node add
+When a node is added to the cluster, it is assigned shards so that load is relieved fairly from the existing nodes. The shards assigned to the new node will become INITIALIZING; the nodes then discover they need to be bootstrapped and will begin bootstrapping the data using all replicas available. The shards that will be removed from the existing nodes are marked as LEAVING.
+Node down
+A node needs to be explicitly taken out of the cluster. If a node goes down and is unavailable, clients performing reads will be served an error from that replica for the shard range that the node owns. During this time reads will rely on the other replicas to continue uninterrupted operation.
+Node remove
+When a node is removed, the shards it owns are assigned to existing nodes in the cluster. The remaining servers discover they are now in possession of shards that are INITIALIZING and need to be bootstrapped, and will begin bootstrapping the data using all replicas available.
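+
+To make a node add concrete, the sketch below shows the general shape of adding an instance through the coordinator's placement API. The endpoint path and field names follow the placement configuration guide as we understand it, and the instance ID, isolation group, hostname, and port are placeholders you would replace for your own deployment:
+curl -X POST http://localhost:7201/api/v1/services/m3db/placement -d '{
+  "instances": [
+    {
+      "id": "m3db_node_04",
+      "isolation_group": "rack-d",
+      "zone": "embedded",
+      "weight": 100,
+      "endpoint": "m3db_node_04:9000",
+      "hostname": "m3db_node_04",
+      "port": 9000
+    }
+  ]
+}'
+
+Once the new instance appears in the placement, its shards move through INITIALIZING and bootstrap as described above, while shards being moved off existing nodes are marked LEAVING until the handoff completes.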
+
diff --git a/docs-beta/content/operational_guides/managing_m3db/namespace_mgmt.md b/docs-beta/content/operational_guides/managing_m3db/namespace_mgmt.md
index aaad933496..25afb8416a 100644
--- a/docs-beta/content/operational_guides/managing_m3db/namespace_mgmt.md
+++ b/docs-beta/content/operational_guides/managing_m3db/namespace_mgmt.md
@@ -1,6 +1,8 @@
---
-title: "Namespace_mgmt"
+title: "Namespace management"
date: 2020-04-21T20:58:51-04:00
draft: true
---
+Namespaces
+All namespaces that you wish to query from must be configured when setting up M3DB. If you wish to add or change an existing namespace, please follow the namespace operational guide here.
diff --git a/docs-beta/content/operational_guides/managing_query.md b/docs-beta/content/operational_guides/managing_query.md
index a221ef0299..0798529469 100644
--- a/docs-beta/content/operational_guides/managing_query.md
+++ b/docs-beta/content/operational_guides/managing_query.md
@@ -1,6 +1,286 @@
---
-title: "Managing_query"
+title: "Managing M3 Query"
date: 2020-04-21T20:59:17-04:00
draft: true
---
+M3DB => M3 Query Blocks
+In order to convert M3DB blocks into M3 Query blocks, we need to consolidate across different namespaces. In short, M3DB namespaces are essentially different resolutions that metrics are stored at. For example, a metric might be stored at both 1min and 10min resolutions, meaning this metric is found in two namespaces.
+At a high level, M3DB returns to M3 Query SeriesBlocks that contain a list of SeriesIterators for a given timeseries per namespace. M3 Query then aligns the blocks across common time bounds before applying consolidation.
+For example, let's say we have a query that returns two timeseries from two different namespaces: 1min and 10min. When we create the M3 Query Block, in order to accurately consolidate results from these two namespaces, we need to convert everything to a 10min resolution. Otherwise it will not be possible to correctly apply functions.
+Coming Soon: More documentation on how M3 Query applies consolidation.
+Fetching and querying
+Fetch fanout
+Since m3query does not currently have a view into the M3DB index, fanout to multiple clusters is rather complicated. Since not every metric is necessarily in every cluster (for example, carbon metrics routed to a certain resolution), it is not trivial to determine which namespaces should be queried to return a fully correct set of recorded metrics.
+The general approach is therefore to attempt to fan out to any namespace which has a complete view of all metrics (for example, Unaggregated) and take that if it fulfills the query range; if not, m3query will attempt to stitch together namespaces with longer retentions to build the most complete possible view of stored metrics.
+For further details, please ask questions on our gitter, and we'll be happy to help!
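+As a hedged illustration of what this fanout means for callers: queries are simply issued against the coordinator's Prometheus-compatible query_range endpoint (documented in the API reference), and the namespace selection described above happens server-side. The host, port, metric name, and time range below are placeholders:
+curl 'http://localhost:7201/api/v1/query_range?query=rate(http_requests_total[5m])&start=1530220800&end=1530230800&step=60s'
+
+If the unaggregated namespace covers the requested range it is used on its own; otherwise m3query stitches in longer-retention aggregated namespaces as described above.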
+
+Function Processing
+Supported Functions
+
+| M3QL | Prometheus | Graphite |
+|------|------------|----------|
+| abs/absolute | abs() | absolute(seriesList) |
+| alias [alias] | | alias(seriesList, newName) |
+| aliasByTags [tag] | | aliasByTags(seriesList, *tags) |
+| aliasByBucket/aliasByHistogramBucket [tag] | | |
+| anomalies [flags] | | |
+| asPercent | / | asPercent(seriesList, total=None, *nodes) |
+| avg/averageSeries [tag] | avg() | averageSeries(*seriesLists) |
+| changed | | changed(seriesList) |
+| constantLine [value] | | constantLine(value) |
+| count | count() | countSeries(*seriesLists) |
+| derivative | | derivative(seriesList) |
+| diff | - | diffSeries(*seriesLists) |
+| divideSeries | / | divideSeries(dividendSeriesList, divisorSeries) |
+| eq/== [value] | == | removeBelowValue(seriesList, n)/removeAboveValue(seriesList, n) |
+| ne/!= [value] | != | removeBelowValue(seriesList, n)/removeAboveValue(seriesList, n) |
+| excludeByTag [tag, pattern] | | exclude(seriesList, pattern) |
+| execute/exec [fetch] | | |
+| fallbackSeries [replacement] | | fallbackSeries(seriesList, fallback) |
+| fetch | | |
+| ge/=> [value] | >= | removeBelowValue(seriesList, n) |
+| gt/> [value] | > | removeBelowValue(seriesList, n) |
+| head [limit] | topk() | highest(seriesList, n=1, func='average') |
+| histogramCDF [idTag, rangeTag, value] | | |
+| histogramPercentile [idTag, rangeTag, percentileValue] | | |
+| identity [name] | | identity(name) |
+| integral | | integral(seriesList) |
+| intersect [tags] | and/or | |
+| isNonNull | | isNonNull(seriesList) |
+| jainCP | | |
+| keepLastValue | | keepLastValue(seriesList, limit=inf) |
+| le/<= [value] | <= | removeAboveValue(seriesList, n) |
+| logarithm | ln() | logarithm(seriesList, base=10) |
+| lt/< [value] | < | removeAboveValue(seriesList, n) |
+| max/maxSeries [tag] | max() | maxSeries(*seriesLists) |
+| min/minSeries [tag] | min() | minSeries(*seriesLists) |
+| moving [interval, func] | _over_time() | movingMax, movingMin, movingMedian, movingAverage, etc. |
+| multiply/multiplySeries [tag] | * | multiplySeries(*seriesLists) |
+| nonNegativeDerivative [maxValue] | | nonNegativeDerivative(seriesList, maxValue=None) |
+| nPercentile [percentile] | | nPercentile(seriesList, n) |
+| offset [amount] | | offset(seriesList, factor) |
+| percentileOfSeries [n, true/false, tag] | | percentileOfSeries(seriesList, n, interpolate=False) |
+| perSecond | rate() | perSecond(seriesList, maxValue=None) |
+| promHistogramPercentile [percentileValue] | | |
+| range [tag] | | rangeOfSeries(*seriesLists) |
+| removeAbovePercentile [percentile] | | removeAbovePercentile(seriesList, n) |
+| removeBelowPercentile [percentile] | | removeBelowPercentile(seriesList, n) |
+| removeAboveValue [value] | | removeAboveValue(seriesList, n) |
+| removeBelowValue [value] | | removeBelowValue(seriesList, n) |
+| removeEmpty | | removeEmptySeries(seriesList, xFilesFactor=None) |
+| scale [factor] | | scale(seriesList, factor) |
+| scaleToSeconds [seconds] | | scaleToSeconds(seriesList, seconds) |
+| setDiff [tags] | | |
+| showAnomalyThresholds [level, model] | | |
+| showTags [true/false, tagName(s)] | | |
+| sort/sortSeries [avg, current, max, stddev, sum] | sort() | sortBy(seriesList, func='average', reverse=False) |
+| stdev [points, windowTolerance] | stddev() | stdev(seriesList, points, windowTolerance=0.1) |
+| sqrt/squareRoot | sqrt() | squareRoot(seriesList) |
+| summarize [interval, func, alignToFrom] | | summarize(seriesList, intervalString, func='sum', alignToFrom=False) |
+| sum/sumSeries [tag] | sum() | sumSeries(*seriesLists) |
+| sustain [duration] | | |
+| sustainedAbove & sustainedBelow | | |
+| tail [limit] | bottomk() | lowest(seriesList, n=1, func='average') |
+| timeshift [duration] | | timeShift(seriesList, timeShift, resetEnd=True, alignDST=False) |
+| timestamp | timestamp() | |
+| transformNull [value] | | transformNull(seriesList, default=0, referenceSeries=None) |
+
+Setting up m3query
+Introduction
+m3query is used to query data that is stored in M3DB. For instance, if you are using the Prometheus remote write endpoint with m3coordinator, you can use m3query instead of the Prometheus remote read endpoint. By doing so, you get all of the benefits of m3query's engine such as block processing. Furthermore, since m3query provides a Prometheus compatible API, you can use 3rd party graphing and alerting solutions like Grafana.
+Configuration
+Before setting up m3query, make sure that you have at least one M3DB node running. In order to start m3query, you need to configure a YAML file that will be used to connect to M3DB. Here is a link to a sample config file that is used for an embedded etcd cluster within M3DB.
+Running
+You can run m3query by either building and running the binary yourself:
+make m3query
+./bin/m3query -f ./src/query/config/m3query-local-etcd.yml
+
+Or you can run it with Docker using the Dockerfile located at $GOPATH/src/github.com/m3db/m3/docker/m3query/Dockerfile.
diff --git a/docs-beta/content/quickstart/_index.md b/docs-beta/content/quickstart/_index.md
index 6d8e667d9b..51e7a65df3 100644
--- a/docs-beta/content/quickstart/_index.md
+++ b/docs-beta/content/quickstart/_index.md
@@ -1,13 +1,112 @@
+++
title = "Quickstart"
date = 2020-04-21T20:46:17-04:00
-weight = 5
+weight = 3
chapter = true
-pre = "X. "
+pre = "3. "
+++

### Chapter X

-# Some Chapter title
+M3DB Single Node Deployment
+Deploying a single-node cluster is a great way to experiment with M3DB and get a feel for what it has to offer.
Our Docker image by default configures a single M3DB instance as one binary containing:
+An M3DB storage instance (m3dbnode) for timeseries storage. This includes an embedded tag-based metrics index, as well as an embedded etcd server for storing the above mentioned cluster topology and runtime configuration.
+A "coordinator" instance (m3coordinator) for writing and querying tagged metrics, as well as managing cluster topology and runtime configuration.
+To begin, first start up a Docker container with port 7201 (used to manage the cluster topology), port 7203 (where Prometheus scrapes metrics produced by M3DB and M3Coordinator), and port 9003 (used to read and write metrics) exposed. We recommend you create a persistent data directory on your host for durability:
+docker pull quay.io/m3db/m3dbnode:latest
+docker run -p 7201:7201 -p 7203:7203 -p 9003:9003 --name m3db -v $(pwd)/m3db_data:/var/lib/m3db quay.io/m3db/m3dbnode:latest
+
-Lorem Ipsum.
\ No newline at end of file
+Note: For the single node case, we use this sample config file. If you inspect the file, you'll see that all the configuration is namespaced by coordinator or db. That's because this setup runs M3DB and M3Coordinator as one application. While this is convenient for testing and development, you'll want to run clustered M3DB with a separate M3Coordinator in production. You can read more about that here.
+Next, create an initial namespace for your metrics in the database using the cURL below. Keep in mind that the provided namespaceName must match the namespace in the local section of the M3Coordinator YAML configuration, and if you choose to add any additional namespaces you'll need to add them to the local section of M3Coordinator's YAML config as well.
+curl -X POST http://localhost:7201/api/v1/database/create -d '{
+  "type": "local",
+  "namespaceName": "default",
+  "retentionTime": "12h"
+}'
+
+Note: The api/v1/database/create endpoint is an abstraction over two concepts in M3DB called placements and namespaces. If a placement doesn't exist, it will create one based on the type argument; otherwise, if the placement already exists, it just creates the specified namespace. For now it's enough to just understand that it creates M3DB namespaces (tables), but if you're going to run a clustered M3 setup in production, make sure you familiarize yourself with the links above.
+Placement initialization may take a minute or two and you can check on the status of this by running the following:
+curl http://localhost:7201/api/v1/placement | jq .
+
+Once all of the shards become AVAILABLE, you should see your node complete bootstrapping! Don't worry if you see warnings or errors related to a local cache file, such as [W] could not load cache from file /var/lib/m3kv/m3db_embedded.json. Those are expected for a local instance and in general any warn-level errors (prefixed with [W]) should not block bootstrapping.
+02:28:30.008072[I] updating database namespaces [{adds [default]} {updates []} {removals []}]
+02:28:30.270681[I] node tchannelthrift: listening on 0.0.0.0:9000
+02:28:30.271909[I] cluster tchannelthrift: listening on 0.0.0.0:9001
+02:28:30.519468[I] node httpjson: listening on 0.0.0.0:9002
+02:28:30.520061[I] cluster httpjson: listening on 0.0.0.0:9003
+02:28:30.520652[I] bootstrap finished [{namespace metrics} {duration 55.4µs}]
+02:28:30.520909[I] bootstrapped
+
+The node also self-hosts its OpenAPI docs, outlining available endpoints. You can access this by going to localhost:7201/api/v1/openapi in your browser.
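+If you also want to confirm that the namespace itself was registered (not just the placement), the coordinator exposes a namespace listing endpoint as well. This is an optional sanity check, assuming the same local single-node setup as above:
+curl http://localhost:7201/api/v1/namespace | jq .
+
+The default namespace you just created should be listed in the response.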
+ +Now you can experiment with writing tagged metrics: +curl -sS -X POST http://localhost:9003/writetagged -d '{ + "namespace": "default", + "id": "foo", + "tags": [ + { + "name": "__name__", + "value": "user_login" + }, + { + "name": "city", + "value": "new_york" + }, + { + "name": "endpoint", + "value": "/request" + } + ], + "datapoint": { + "timestamp": '"$(date "+%s")"', + "value": 42.123456789 + } +} +' + +Note: In the above example we include the tag __name__. This is because __name__ is a reserved tag in Prometheus and will make querying the metric much easier. For example, if you have M3Query setup as a Prometheus datasource in Grafana, you can then query for the metric using the following PromQL query: +user_login{city="new_york",endpoint="/request"} + +And reading the metrics you've written using the M3DB /query endpoint: +curl -sS -X POST http://localhost:9003/query -d '{ + "namespace": "default", + "query": { + "regexp": { + "field": "city", + "regexp": ".*" + } + }, + "rangeStart": 0, + "rangeEnd": '"$(date "+%s")"' +}' | jq . + +{ + "results": [ + { + "id": "foo", + "tags": [ + { + "name": "__name__", + "value": "user_login" + }, + { + "name": "city", + "value": "new_york" + }, + { + "name": "endpoint", + "value": "/request" + } + ], + "datapoints": [ + { + "timestamp": 1527039389, + "value": 42.123456789 + } + ] + } + ], + "exhaustive": true +} + +Now that you've got the M3 stack up and running, take a look at the rest of our documentation to see how you can integrate with Prometheus and Graphite diff --git a/docs-beta/content/reference_docs/_index.md b/docs-beta/content/reference_docs/_index.md index 28de5d6f7c..14cee66f4a 100644 --- a/docs-beta/content/reference_docs/_index.md +++ b/docs-beta/content/reference_docs/_index.md @@ -1,9 +1,9 @@ +++ -title = "Reference_docs" +title = "Reference Documentation" date = 2020-04-21T21:00:26-04:00 -weight = 5 +weight = 7 chapter = true -pre = "X. " +pre = "7. " +++ ### Chapter X diff --git a/docs-beta/content/reference_docs/apis.md b/docs-beta/content/reference_docs/apis.md index c5b967fc65..eba0f7de6b 100644 --- a/docs-beta/content/reference_docs/apis.md +++ b/docs-beta/content/reference_docs/apis.md @@ -1,6 +1,184 @@ --- -title: "Apis" +title: "APIs" date: 2020-04-21T21:02:36-04:00 draft: true --- +M3 Coordinator, API for reading/writing metrics and M3 management +M3 Coordinator is a service that coordinates reads and writes between upstream systems, such as Prometheus, and downstream systems, such as M3DB. +It also provides management APIs to setup and configure different parts of M3. +The coordinator is generally a bridge for read and writing different types of metrics formats and a management layer for M3. + +API +The M3 Coordinator implements the Prometheus Remote Read and Write HTTP endpoints, they also can be used however as general purpose metrics write and read APIs. Any metrics that are written to the remote write API can be queried using PromQL through the query APIs as well as being able to be read back by the Prometheus Remote Read endpoint. +Remote Write +Write a Prometheus Remote write query to M3. +URL +/api/v1/prom/remote/write +Method +POST +URL Params +None. +Header Params +Optional +M3-Metrics-Type: +If this header is set, it determines what type of metric to store this metric value as. Otherwise by default, metrics will be stored in all namespaces that are configured. 
You can also disable this default behavior by setting downsample options to all: false for a namespace in the coordinator config, for more see disabling automatic aggregation. + +Must be one of: +unaggregated: Write metrics directly to configured unaggregated namespace. +aggregated: Write metrics directly to a configured aggregated namespace (bypassing any aggregation), this requires the M3-Storage-Policy header to be set to resolve which namespace to write metrics to. + + +M3-Storage-Policy: +If this header is set, it determines which aggregated namespace to read/write metrics directly to/from (bypassing any aggregation). +The value of the header must be in the format of resolution:retention in duration shorthand. e.g. 1m:48h specifices 1 minute resolution and 48 hour retention. Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". + +Here is an example of querying metrics from a specific namespace. +Data Params +Binary snappy compressed Prometheus WriteRequest protobuf message. +Available Tuning Params +Refer here for an up to date list of remote tuning parameters. +Sample Call +There isn't a straightforward way to Snappy compress and marshal a Prometheus WriteRequest protobuf message using just shell, so this example uses a specific command line utility instead. +This sample call is made using promremotecli which is a command line tool that uses a Go client to Prometheus Remote endpoints. For more information visit the GitHub repository. +There is also a Java client that can be used to make requests to the endpoint. +Each -t parameter specifies a label (dimension) to add to the metric. +The -h parameter can be used as many times as necessary to add headers to the outgoing request in the form of "Header-Name: HeaderValue". +Here is an example of writing the datapoint at the current unix timestamp with value 123.456: +docker run -it --rm \ + quay.io/m3db/prometheus_remote_client_golang:latest \ + -u http://host.docker.internal:7201/api/v1/prom/remote/write \ + -t __name__:http_requests_total \ + -t code:200 \ + -t handler:graph \ + -t method:get \ + -d $(date +"%s"),123.456 +promremotecli_log 2019/06/25 04:13:56 writing datapoint [2019-06-25 04:13:55 +0000 UTC 123.456] +promremotecli_log 2019/06/25 04:13:56 labelled [[__name__ http_requests_total] [code 200] [handler graph] [method get]] +promremotecli_log 2019/06/25 04:13:56 writing to http://host.docker.internal:7201/api/v1/prom/remote/write +{"success":true,"statusCode":200} +promremotecli_log 2019/06/25 04:13:56 write success + +# If you are paranoid about image tags being hijacked/replaced with nefarious code, you can use this SHA256 tag: +# quay.io/m3db/prometheus_remote_client_golang@sha256:fc56df819bff9a5a087484804acf3a584dd4a78c68900c31a28896ed66ca7e7b + +For more details on querying data in PromQL that was written using this endpoint, see the query API documentation. +Remote Read +Read Prometheus metrics from M3. +URL +/api/v1/prom/remote/read +Method +POST +URL Params +None. +Header Params +Optional +M3-Metrics-Type: +If this header is set, it determines what type of metric to store this metric value as. Otherwise by default, metrics will be stored in all namespaces that are configured. You can also disable this default behavior by setting downsample options to all: false for a namespace in the coordinator config, for more see disabling automatic aggregation. + +Must be one of: +unaggregated: Write metrics directly to configured unaggregated namespace. 
+aggregated: Write metrics directly to a configured aggregated namespace (bypassing any aggregation), this requires the M3-Storage-Policy header to be set to resolve which namespace to write metrics to. + + +M3-Storage-Policy: +If this header is set, it determines which aggregated namespace to read/write metrics directly to/from (bypassing any aggregation). +The value of the header must be in the format of resolution:retention in duration shorthand. e.g. 1m:48h specifices 1 minute resolution and 48 hour retention. Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". + +Here is an example of querying metrics from a specific namespace. +Data Params +Binary snappy compressed Prometheus WriteRequest protobuf message. + +Query Engine + +API +Please note: This documentation is a work in progress and more detail is required. +Query using PromQL +Query using PromQL and returns JSON datapoints compatible with the Prometheus Grafana plugin. +URL +/api/v1/query_range +Method +GET +URL Params +Required +start=[time in RFC3339Nano] +end=[time in RFC3339Nano] +step=[time duration] +target=[string] +Optional +debug=[bool] +lookback=[string|time duration]: This sets the per request lookback duration to something other than the default set in config, can either be a time duration or the string "step" which sets the lookback to the same as the step request parameter. +Header Params +Optional +M3-Metrics-Type: +If this header is set, it determines what type of metric to store this metric value as. Otherwise by default, metrics will be stored in all namespaces that are configured. You can also disable this default behavior by setting downsample options to all: false for a namespace in the coordinator config, for more see disabling automatic aggregation. + +Must be one of: +unaggregated: Write metrics directly to configured unaggregated namespace. +aggregated: Write metrics directly to a configured aggregated namespace (bypassing any aggregation), this requires the M3-Storage-Policy header to be set to resolve which namespace to write metrics to. + + +M3-Storage-Policy: +If this header is set, it determines which aggregated namespace to read/write metrics directly to/from (bypassing any aggregation). +The value of the header must be in the format of resolution:retention in duration shorthand. e.g. 1m:48h specifices 1 minute resolution and 48 hour retention. Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". + +Here is an example of querying metrics from a specific namespace. +Tag Mutation +The M3-Map-Tags-JSON header enables dynamically mutating tags in Prometheus write request. See 2254 for more background. +Currently only write is supported. As an example, the following header would unconditionally cause globaltag=somevalue to be added to all metrics in a write request: +M3-Map-Tags-JSON: '{"tagMappers":[{"write":{"tag":"globaltag","value":"somevalue"}}]}' + +Data Params +None. 
+Sample Call +curl 'http://localhost:7201/api/v1/query_range?query=abs(http_requests_total)&start=1530220860&end=1530220900&step=15s' +{ + "status": "success", + "data": { + "resultType": "matrix", + "result": [ + { + "metric": { + "code": "200", + "handler": "graph", + "method": "get" + }, + "values": [ + [ + 1530220860, + "6" + ], + [ + 1530220875, + "6" + ], + [ + 1530220890, + "6" + ] + ] + }, + { + "metric": { + "code": "200", + "handler": "label_values", + "method": "get" + }, + "values": [ + [ + 1530220860, + "6" + ], + [ + 1530220875, + "6" + ], + [ + 1530220890, + "6" + ] + ] + } + ] + } +} diff --git a/docs-beta/content/reference_docs/architecture.md b/docs-beta/content/reference_docs/architecture.md deleted file mode 100644 index 4e5b8c1784..0000000000 --- a/docs-beta/content/reference_docs/architecture.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: "Architecture" -date: 2020-04-21T21:00:49-04:00 -draft: true ---- - diff --git a/docs-beta/content/reference_docs/architecture/aggregator.md b/docs-beta/content/reference_docs/architecture/aggregator.md index 79b4964f9b..2b22c1080d 100644 --- a/docs-beta/content/reference_docs/architecture/aggregator.md +++ b/docs-beta/content/reference_docs/architecture/aggregator.md @@ -1,5 +1,5 @@ --- -title: "Aggregator" +title: "M3 Aggregator" date: 2020-04-21T21:01:14-04:00 draft: true --- diff --git a/docs-beta/content/reference_docs/architecture/coordinator.md b/docs-beta/content/reference_docs/architecture/coordinator.md index 1a30f14174..bd4007ed04 100644 --- a/docs-beta/content/reference_docs/architecture/coordinator.md +++ b/docs-beta/content/reference_docs/architecture/coordinator.md @@ -1,5 +1,5 @@ --- -title: "Coordinator" +title: "M3 Coordinator" date: 2020-04-21T21:01:05-04:00 draft: true --- diff --git a/docs-beta/content/reference_docs/architecture/m3db.md b/docs-beta/content/reference_docs/architecture/m3db.md index 72ff137cc6..af67c020d8 100644 --- a/docs-beta/content/reference_docs/architecture/m3db.md +++ b/docs-beta/content/reference_docs/architecture/m3db.md @@ -1,6 +1,45 @@ --- -title: "M3db" +title: "M3DB" date: 2020-04-21T21:00:55-04:00 draft: true --- +M3DB is a persistent database with durable storage, but it is best understood via the boundary between its in-memory object layout and on-disk representations. +In-Memory Object Layout + ┌────────────────────────────────────────────────────────────┐ + │ Database │ + ├────────────────────────────────────────────────────────────┤ + │ │ + │ ┌────────────────────────────────────────────────────┐ │ + │ │ Namespaces │ │ + │ ├────────────────────────────────────────────────────┤ │ + │ │ │ │ + │ │ ┌────────────────────────────────────────────┐ │ │ + │ │ │ Shards │ │ │ + │ │ ├────────────────────────────────────────────┤ │ │ + │ │ │ │ │ │ + │ │ │ ┌────────────────────────────────────┐ │ │ │ + │ │ │ │ Series │ │ │ │ + │ │ │ ├────────────────────────────────────┤ │ │ │ + │ │ │ │ │ │ │ │ + │ │ │ │ ┌────────────────────────────┐ │ │ │ │ + │ │ │ │ │ Buffer │ │ │ │ │ + │ │ │ │ └────────────────────────────┘ │ │ │ │ + │ │ │ │ │ │ │ │ + │ │ │ │ │ │ │ │ + │ │ │ │ ┌────────────────────────────┐ │ │ │ │ + │ │ │ │ │ Cached blocks │ │ │ │ │ + │ │ │ │ └────────────────────────────┘ │ │ │ │ + │ │ │ │ ... │ │ │ │ + │ │ │ │ │ │ │ │ + │ │ │ └────────────────────────────────────┘ │ │ │ + │ │ │ ... │ │ │ + │ │ │ │ │ │ + │ │ └────────────────────────────────────────────┘ │ │ + │ │ ... │ │ + │ │ │ │ + │ └────────────────────────────────────────────────────┘ │ + │ ... 
│ + │ │ + └────────────────────────────────────────────────────────────┘ + diff --git a/docs-beta/content/reference_docs/architecture/query.md b/docs-beta/content/reference_docs/architecture/query.md index 6f0986412a..806145ab8b 100644 --- a/docs-beta/content/reference_docs/architecture/query.md +++ b/docs-beta/content/reference_docs/architecture/query.md @@ -1,6 +1,54 @@ --- -title: "Query" +title: "M3 Query" date: 2020-04-21T21:00:59-04:00 draft: true --- +Architecture +Please note: This documentation is a work in progress and more detail is required. +Overview +M3 Query and M3 Coordinator are written entirely in Go, M3 Query is as a query engine for M3DB and M3 Coordinator is a remote read/write endpoint for Prometheus and M3DB. To learn more about Prometheus's remote endpoints and storage, see here. + +Blocks +Please note: This documentation is a work in progress and more detail is required. +Overview +The fundamental data structures that M3 Query uses are Blocks. Blocks are what get created from the series iterators that M3DB returns. A Block is associated with a start and end time. It contains data from multiple time series stored in columnar format. +Most transformations within M3 Query will be applied across different series for each time interval. Therefore, having data stored in columnar format helps with the memory locality of the data. Moreover, most transformations within M3 Query can work in parallel on different blocks which can significantly increase the computation speed. +Diagram +Below is a visual representation of a set of Blocks. On top is the M3QL query that gets executed, and on the bottom, are the results of the query containing 3 different Blocks. + ┌───────────────────────────────────────────────────────────────────────┐ + │ │ + │ fetch name:sign_up city_id:{new_york,san_diego,toronto} os:* │ + │ │ + └───────────────────────────────────────────────────────────────────────┘ + │ │ │ + │ │ │ + │ │ │ + ▼ ▼ ▼ + ┌────────────┐ ┌────────────┐ ┌─────────────┐ + │ Block One │ │ Block Two │ │ Block Three │ + └────────────┘ └────────────┘ └─────────────┘ + ┌──────┬──────┬──────┐ ┌──────┬──────┬──────┐ ┌──────┬──────┬──────┐ + │ t │ t+1 │ t+2 │ │ t+3 │ t+4 │ t+5 │ │ t+6 │ t+7 │ t+8 │ + ├──────┼──────┼──────▶ ├──────┼──────┼──────▶ ├──────┼──────┼──────▶ +┌───────────────────────────┐ │ │ │ │ │ │ │ │ │ │ │ │ +│ name:sign_up │ │ │ │ │ │ │ │ │ │ │ │ │ +│ city_id:new_york os:ios │ │ 5 │ 2 │ 10 │ │ 10 │ 2 │ 10 │ │ 5 │ 3 │ 5 │ +└───────────────────────────┘ │ │ │ │ │ │ │ │ │ │ │ │ + ├──────┼──────┼──────▶ ├──────┼──────┼──────▶ ├──────┼──────┼──────▶ +┌───────────────────────────┐ │ │ │ │ │ │ │ │ │ │ │ │ +│ name:sign_up │ │ │ │ │ │ │ │ │ │ │ │ │ +│city_id:new_york os:android│ │ 10 │ 8 │ 5 │ │ 20 │ 4 │ 5 │ │ 10 │ 8 │ 5 │ +└───────────────────────────┘ │ │ │ │ │ │ │ │ │ │ │ │ + ├──────┼──────┼──────▶ ├──────┼──────┼──────▶ ├──────┼──────┼──────▶ +┌───────────────────────────┐ │ │ │ │ │ │ │ │ │ │ │ │ +│ name:sign_up │ │ │ │ │ │ │ │ │ │ │ │ │ +│ city_id:san_diego os:ios │ │ 10 │ 5 │ 10 │ │ 2 │ 5 │ 10 │ │ 8 │ 6 │ 6 │ +└───────────────────────────┘ │ │ │ │ │ │ │ │ │ │ │ │ + ├──────┼──────┼──────▶ ├──────┼──────┼──────▶ ├──────┼──────┼──────▶ +┌───────────────────────────┐ │ │ │ │ │ │ │ │ │ │ │ │ +│ name:sign_up │ │ │ │ │ │ │ │ │ │ │ │ │ +│ city_id:toronto os:ios │ │ 2 │ 5 │ 10 │ │ 2 │ 5 │ 10 │ │ 2 │ 5 │ 10 │ +└───────────────────────────┘ │ │ │ │ │ │ │ │ │ │ │ │ + └──────┴──────┴──────┘ └──────┴──────┴──────┘ └──────┴──────┴──────┘ + diff --git 
a/docs-beta/content/reference_docs/configurations.md b/docs-beta/content/reference_docs/configurations.md deleted file mode 100644 index 584c9ba11f..0000000000 --- a/docs-beta/content/reference_docs/configurations.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: "Configurations" -date: 2020-04-21T21:01:23-04:00 -draft: true ---- - diff --git a/docs-beta/content/reference_docs/configurations/annotated_config.md b/docs-beta/content/reference_docs/configurations/annotated_config.md index a926b5d1be..e565d4cfbc 100644 --- a/docs-beta/content/reference_docs/configurations/annotated_config.md +++ b/docs-beta/content/reference_docs/configurations/annotated_config.md @@ -1,5 +1,5 @@ --- -title: "Annotated_config" +title: "Annotated configuration file" date: 2020-04-21T21:01:32-04:00 draft: true --- diff --git a/docs-beta/content/reference_docs/configurations/availability.md b/docs-beta/content/reference_docs/configurations/availability.md index db16ded93b..57040f5718 100644 --- a/docs-beta/content/reference_docs/configurations/availability.md +++ b/docs-beta/content/reference_docs/configurations/availability.md @@ -1,6 +1,87 @@ --- -title: "Availability" +title: "Availability, consistency, and durability" date: 2020-04-21T21:02:08-04:00 draft: true --- +Consistency Levels +M3DB provides variable consistency levels for read and write operations, as well as cluster connection operations. These consistency levels are handled at the client level. +Write consistency levels +One: Corresponds to a single node succeeding for an operation to succeed. +Majority: Corresponds to the majority of nodes succeeding for an operation to succeed. +All: Corresponds to all nodes succeeding for an operation to succeed. +Read consistency levels +One: Corresponds to reading from a single node to designate success. +UnstrictMajority: Corresponds to reading from the majority of nodes but relaxing the constraint when it cannot be met, falling back to returning success when reading from at least a single node after attempting reading from the majority of nodes. +Majority: Corresponds to reading from the majority of nodes to designate success. +All: Corresponds to reading from all of the nodes to designate success. +Connect consistency levels +Connect consistency levels are used to determine when a client session is deemed as connected before operations can be attempted. +Any: Corresponds to connecting to any number of nodes for all shards, this strategy will attempt to connect to all, then the majority, then one and then fallback to none and as such will always succeed. +None: Corresponds to connecting to no nodes for all shards and as such will always succeed. +One: Corresponds to connecting to a single node for all shards. +Majority: Corresponds to connecting to the majority of nodes for all shards. +All: Corresponds to connecting to all of the nodes for all shards. + + +Tuning Availability, Consistency, and Durability +Overview +M3DB is designed as a High Availability HA system because it doesn't use a consensus protocol like Raft or Paxos to enforce strong consensus and consistency guarantees. However, even within the category of HA systems, there is a broad spectrum of consistency and durability guarantees that a database can provide. To address as many use cases as possible, M3DB can be tuned to achieve the desired balance between performance, availability, durability, and consistency. 
+Generally speaking, the default and example configuration for M3DB favors performance and availability, as that is well-suited for M3DB's most common metrics and observability use cases. Database operators who are using M3DB for workloads that require stricter consistency and durability guarantees should consider tuning the default configuration to better suit their use case, as described in the "Tuning for Consistency and Durability" section below.
+The rest of this document describes the various configuration options that are available to M3DB operators to make such tradeoffs. While reading it, we recommend referring to the default configuration file (which has every possible configuration value set) to see how the described values fit into M3DB's configuration as a whole.
+Tuning for Performance and Availability
+Client Write and Read consistency
+We recommend running the client with writeConsistencyLevel set to majority and readConsistencyLevel set to unstrict_majority. This means that all writes must be acknowledged by a quorum of nodes in order to be considered successful, and that reads will attempt to achieve quorum but will return data from a single node if they are unable to achieve quorum. This ensures that reads are normally consistent, but that degraded conditions will not cause reads to fail outright as long as at least a single node can satisfy the request.
+You can read about the consistency levels in more detail in the Consistency Levels section.
+Commitlog Configuration
+We recommend running M3DB with an asynchronous commitlog. This means that writes will be reported as successful by the client even though the data may not have been flushed to disk yet.
+For example, consider the default configuration:
+commitlog:
+  flushMaxBytes: 524288
+  flushEvery: 1s
+  queue:
+    calculationType: fixed
+    size: 2097152
+
+This configuration states that the commitlog should be flushed whenever either of the following is true:
+524288 or more bytes have been written since the last time M3DB flushed the commitlog.
+One or more seconds have elapsed since the last time M3DB flushed the commitlog.
+In addition, the configuration states that M3DB should allow up to 2097152 writes to be buffered in the commitlog queue before the database node begins rejecting incoming writes so it can attempt to drain the queue and catch up. Increasing the size of this queue can often increase the write throughput of an M3DB node, at the cost of potentially losing more data if the node experiences a sudden failure like a hard crash or power loss.
+Writing New Series Asynchronously
+The default M3DB YAML configuration will contain the following as a top-level key under the db section:
+writeNewSeriesAsync: true
+
+This instructs M3DB to handle writes for new timeseries (for a given time block) asynchronously. Creating a new timeseries in memory is much more expensive than simply appending a new write to an existing series, so the default configuration of creating them asynchronously improves M3DB's write throughput significantly when many new series are being created all at once.
+However, since new time series are created asynchronously, it's possible that there may be a brief delay between when a write is acknowledged by the client and when that series becomes available for subsequent reads.
+M3DB also allows operators to rate limit the number of new series that can be created per second via the following configuration:
+writeNewSeriesLimitPerSecond: 1048576
+
+This value can be set much lower than the default for workloads in which a significant increase in cardinality usually indicates a misbehaving caller.
+Ignoring Corrupt Commitlogs on Bootstrap
+If M3DB is shut down gracefully (i.e. via SIGTERM), it will ensure that all pending writes are flushed to the commitlog on disk before the process exits. However, in situations where the process crashed/exited unexpectedly or the node itself experienced a sudden failure, the tail end of the commitlog may be corrupt. In such situations, M3DB will read as much of the commitlog as possible in an attempt to recover the maximum amount of data. However, it then needs to make a decision: it can either (a) come up successfully and tolerate an ostensibly minor amount of data loss, or (b) attempt to stream the missing data from its peers. This behavior is controlled by the following default configuration:
+bootstrap:
+  commitlog:
+    returnUnfulfilledForCorruptCommitLogFiles: false
+
+In the situation where only a single node fails, the optimal outcome is for the node to attempt to repair itself from one of its peers. However, if a quorum of nodes fail and encounter corrupt commitlog files, they will deadlock while attempting to stream data from each other, as no nodes will be able to make progress due to a lack of quorum. This issue requires an operator with significant M3DB operational experience to manually bootstrap the cluster; thus the official recommendation is to set returnUnfulfilledForCorruptCommitLogFiles: false to avoid this issue altogether. In most cases, a small amount of data loss is preferable to a quorum of nodes that crash and fail to start back up automatically.
+Tuning for Consistency and Durability
+Client Write and Read consistency
+The most important thing to understand is that if you want to guarantee that you will be able to read the result of every successful write, then both writes and reads must be done with majority consistency. This means that both writes and reads will fail if a quorum of nodes are unavailable for a given shard. You can read about the consistency levels in more detail in the Consistency Levels section.
+Commitlog Configuration
+M3DB supports running the commitlog synchronously such that every write is flushed to disk and fsync'd before the client receives a successful acknowledgement, but this is not currently exposed to users in the YAML configuration and generally leads to a massive performance degradation. We only recommend operating M3DB this way for workloads where data consistency and durability are strictly required, and even then there may be better alternatives, such as running M3DB with the bootstrapping configuration filesystem,peers,uninitialized_topology as described in our bootstrapping operational guide.
+Writing New Series Asynchronously
+If you want to guarantee that M3DB will immediately allow you to read data for writes that have been acknowledged by the client, including the situation where the previous write was for a brand new timeseries, then you will need to change the default M3DB configuration to set writeNewSeriesAsync: false as a top-level key under the db section:
+writeNewSeriesAsync: false
+
+This instructs M3DB to handle writes for new timeseries (for a given time block) synchronously.
Creating a new timeseries in memory is much more expensive than simply appending a new write to an existing series, so this configuration could have an adverse effect on performance when many new timeseries are being inserted into M3DB concurrently.
+Since this operation is so expensive, M3DB allows operators to rate limit the number of new series that can be created per second via the following configuration (also a top-level key under the db section):
+writeNewSeriesLimitPerSecond: 1048576
+
+Ignoring Corrupt Commitlogs on Bootstrap
+As described in the "Tuning for Performance and Availability" section, we recommend configuring M3DB to ignore corrupt commitlog files on bootstrap. However, if you want to avoid any amount of inconsistency or data loss, no matter how minor, then you should configure M3DB to return unfulfilled when the commitlog bootstrapper encounters corrupt commitlog files. You can do so by modifying your configuration to look like this:
+bootstrap:
+  commitlog:
+    returnUnfulfilledForCorruptCommitLogFiles: true
+
+This will force your M3DB nodes to attempt to repair corrupted commitlog files on bootstrap by streaming the data from their peers. In most situations this will be transparent to the operator and the M3DB node will finish bootstrapping without trouble. However, in the scenario where a quorum of nodes for a given shard failed in unison, the nodes will deadlock while attempting to stream data from each other, as no nodes will be able to make progress due to a lack of quorum. This issue requires an operator with significant M3DB operational experience to manually bootstrap the cluster; thus the official recommendation is to avoid configuring M3DB in this way unless data consistency and durability are of utmost importance.
+
diff --git a/docs-beta/content/reference_docs/configurations/bootstrapping.md b/docs-beta/content/reference_docs/configurations/bootstrapping.md
index 94edbea6af..2f129d3d7f 100644
--- a/docs-beta/content/reference_docs/configurations/bootstrapping.md
+++ b/docs-beta/content/reference_docs/configurations/bootstrapping.md
@@ -4,3 +4,100 @@
date: 2020-04-21T21:02:17-04:00
draft: true
---
+Bootstrapping & Crash Recovery
+Introduction
+We recommend reading the placement operational guide before reading the rest of this document.
+When an M3DB node is turned on (goes through a placement change) it needs to go through a bootstrapping process to determine the integrity of the data that it has, replay writes from the commit log, and/or stream missing data from its peers. In most cases, as long as you're running with the default and recommended bootstrapper configuration of filesystem,commitlog,peers,uninitialized_topology, you should not need to worry about the bootstrapping process at all, and M3DB will take care of doing the right thing such that you don't lose data and consistency guarantees are met. Note that the order of the configured bootstrappers does matter.
+Generally speaking, we recommend that operators do not modify the bootstrappers configuration, but in the rare case that you need to, this document is designed to help you understand the implications of doing so.
+M3DB currently supports 5 different bootstrappers:
+filesystem
+commitlog
+peers
+uninitialized_topology
+noop-all
+When the bootstrapping process begins, M3DB nodes need to determine two things:
+What shards the bootstrapping node should bootstrap, which can be determined from the cluster placement.
+What time-ranges the bootstrapping node needs to bootstrap those shards for, which can be determined from the namespace retention. +For example, imagine a M3DB node that is responsible for shards 1, 5, 13, and 25 according to the cluster placement. In addition, it has a single namespace called "metrics" with a retention of 48 hours. When the M3DB node is started, the node will determine that it needs to bootstrap shards 1, 5, 13, and 25 for the time range starting at the current time and ending 48 hours ago. In order to obtain all this data, it will run the configured bootstrappers in the specified order. Every bootstrapper will notify the bootstrapping process of which shard/ranges it was able to bootstrap and the bootstrapping process will continue working its way through the list of bootstrappers until all the shards/ranges required have been marked as fulfilled. Otherwise the M3DB node will fail to start. +Bootstrappers +Filesystem Bootstrapper +The filesystem bootstrapper's responsibility is to determine which immutable Fileset files exist on disk, and if so, mark them as fulfilled. The filesystem bootstrapper achieves this by scanning M3DB's directory structure and determining which Fileset files exist on disk. Unlike the other bootstrappers, the filesystem bootstrapper does not need to load any data into memory, it simply verifies the checksums of the data on disk and other components of the M3DB node will handle reading (and caching) the data dynamically once it begins to serve reads. +Commitlog Bootstrapper +The commitlog bootstrapper's responsibility is to read the commitlog and snapshot (compacted commitlogs) files on disk and recover any data that has not yet been written out as an immutable Fileset file. Unlike the filesystem bootstrapper, the commit log bootstrapper cannot simply check which files are on disk in order to determine if it can satisfy a bootstrap request. Instead, the commitlog bootstrapper determines whether it can satisfy a bootstrap request using a simple heuristic. +On a shard-by-shard basis, the commitlog bootstrapper will consult the cluster placement to see if the node it is running on has ever achieved the Available status for the specified shard. If so, then the commit log bootstrapper should have all the data since the last Fileset file was flushed and will return that it can satisfy any time range for that shard. In other words, the commit log bootstrapper is all-or-nothing for a given shard: it will either return that it can satisfy any time range for a given shard or none at all. In addition, the commitlog bootstrapper assumes it is running after the filesystem bootstrapper. M3DB will not allow you to run with a configuration where the filesystem bootstrapper is placed after the commitlog bootstrapper, but it will allow you to run the commitlog bootstrapper without the filesystem bootstrapper which can result in loss of data, depending on the workload. +Peers Bootstrapper +The peers bootstrapper's responsibility is to stream in data for shard/ranges from other M3DB nodes (peers) in the cluster. This bootstrapper is only useful in M3DB clusters with more than a single node and where the replication factor is set to a value larger than 1. The peers bootstrapper will determine whether or not it can satisfy a bootstrap request on a shard-by-shard basis by consulting the cluster placement and determining if there are enough peers to satisfy the bootstrap request. 
For example, imagine the following M3DB placement where node A is trying to perform a peer bootstrap: + ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ + │ Node A │ │ Node B │ │ Node C │ +────┴─────────────────┴──────────┴─────────────────┴────────┴─────────────────┴─── +┌─────────────────────────┐ ┌───────────────────────┐ ┌──────────────────────┐ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ Shard 1: Initializing │ │ Shard 1: Initializing │ │ Shard 1: Available │ +│ Shard 2: Initializing │ │ Shard 2: Initializing │ │ Shard 2: Available │ +│ Shard 3: Initializing │ │ Shard 3: Initializing │ │ Shard 3: Available │ +│ │ │ │ │ │ +│ │ │ │ │ │ +└─────────────────────────┘ └───────────────────────┘ └──────────────────────┘ + +In this case, the peers bootstrapper running on node A will not be able to fullfill any requests because node B is in the Initializing state for all of its shards and cannot fulfill bootstrap requests. This means that node A's peers bootstrapper cannot meet its default consistency level of majority for bootstrapping (1 < 2 which is majority with a replication factor of 3). On the other hand, node A would be able to peer bootstrap its shards in the following placement because its peers (nodes B/C) have sufficient replicas of the shards it needs in the Available state: + ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ + │ Node A │ │ Node B │ │ Node C │ +────┴─────────────────┴──────────┴─────────────────┴────────┴─────────────────┴─── +┌─────────────────────────┐ ┌───────────────────────┐ ┌──────────────────────┐ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ Shard 1: Initializing │ │ Shard 1: Available │ │ Shard 1: Available │ +│ Shard 2: Initializing │ │ Shard 2: Available │ │ Shard 2: Available │ +│ Shard 3: Initializing │ │ Shard 3: Available │ │ Shard 3: Available │ +│ │ │ │ │ │ +│ │ │ │ │ │ +└─────────────────────────┘ └───────────────────────┘ └──────────────────────┘ + +Note that a bootstrap consistency level of majority is the default value, but can be modified by changing the value of the key m3db.client.bootstrap-consistency-level in etcd to one of: none, one, unstrict_majority (attempt to read from majority, but settle for less if any errors occur), majority (strict majority), and all. For example, if an entire cluster with a replication factor of 3 was restarted simultaneously, all the nodes would get stuck in an infinite loop trying to peer bootstrap from each other and not achieving majority until an operator modified this value. Note that this can happen even if all the shards were in the Available state because M3DB nodes will reject all read requests for a shard until they have bootstrapped that shard (which has to happen everytime the node is restarted). +Note: Any bootstrappers configuration that does not include the peers bootstrapper will be unable to handle dynamic placement changes of any kind. +Uninitialized Topology Bootstrapper +The purpose of the uninitialized_topology bootstrapper is to succeed bootstraps for all time ranges for shards that have never been completely bootstrapped (at a cluster level). This allows us to run the default bootstrapper configuration of: filesystem,commitlog,peers,topology_uninitialized such that the filesystem and commitlog bootstrappers are used by default in node restarts, the peers bootstrapper is used for node adds/removes/replaces, and bootstraps still succeed for brand new placement where both the commitlog and peers bootstrappers will be unable to succeed any bootstraps. 
In other words, the uninitialized_topology bootstrapper allows us to place the commitlog bootstrapper before the peers bootstrapper and still succeed bootstraps with brand new placements, without resorting to the noop-all bootstrapper, which succeeds bootstraps for all shard/time-ranges regardless of the status of the placement.
+The uninitialized_topology bootstrapper determines whether a placement is "new" for a given shard by counting the number of nodes in the Initializing and Leaving states; if there are more Initializing than Leaving, it succeeds the bootstrap, because that means the placement has never reached a state where all nodes are Available.
+Noop-all Bootstrapper
+The noop-all bootstrapper succeeds all bootstraps regardless of the requested shards/time ranges.
+Bootstrappers Configuration
+Now that we've gone over the various bootstrappers, let's consider how M3DB will behave in different configurations. Note that we include uninitialized_topology at the end of all the lists of bootstrappers because it's required to get a new placement up and running in the first place, but is not required after that (although leaving it in has no detrimental effects). Also note that any configuration that does not include the peers bootstrapper will not be able to handle dynamic placement changes like node adds/removes/replaces.
+filesystem,commitlog,peers,uninitialized_topology (default)
+This is the default bootstrappers configuration for M3DB and will behave "as expected" in the sense that it will maintain M3DB's consistency guarantees at all times, handle node adds/replaces/removes correctly, and still work with brand new placements / topologies. This is the only configuration that we recommend using in production.
+In the general case, the node will use only the filesystem and commitlog bootstrappers on node startup. However, in the case of a node add/remove/replace, the commitlog bootstrapper will detect that it is unable to fulfill the bootstrap request (because the node has never reached the Available state) and defer to the peers bootstrapper to stream in the data.
+Additionally, if it is a brand new placement where even the peers bootstrapper cannot fulfill the bootstrap, this will be detected by the uninitialized_topology bootstrapper, which will succeed the bootstrap.
+filesystem,peers,uninitialized_topology
+Every time a node is restarted it will attempt to stream in all of the data for any blocks that it has never flushed, which is generally the currently active block and possibly the previous block as well. This mode can be useful if you want to improve performance or save disk space by operating nodes without a commitlog, or want to force a repair of any unflushed blocks. This mode can lead to violations of M3DB's consistency guarantees due to the fact that commit logs are being ignored. In addition, if you lose a replication factor's worth (or more) of hosts at the same time, the node will not be able to bootstrap unless an operator modifies the bootstrap consistency level configuration in etcd (see the peers bootstrapper section above). Finally, this mode adds additional network and resource pressure on other nodes in the cluster while one node is peer bootstrapping from them, which can be problematic in catastrophic scenarios where all the nodes are trying to stream data from each other.
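+For reference, the consistency level override mentioned above (and walked through in more detail in the Crash Recovery section below) boils down to writing a single key in etcd. A hedged sketch using the etcd v3 etcdctl CLI against the embedded etcd, assuming the default_env environment, looks like this:
+ETCDCTL_API=3 etcdctl put \
+  "_kv/default_env/m3db.client.bootstrap-consistency-level" \
+  "$(echo -n 'ChF1bnN0cmljdF9tYWpvcml0eQ==' | base64 -d)"
+
+The value written is the decoded Protobuf message containing unstrict_majority described in the Crash Recovery section; on macOS use base64 -D instead of -d.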
+peers,uninitialized_topology
+Every time a node is restarted, it will attempt to stream in all of the data that it is responsible for from its peers, completely ignoring the immutable Fileset files it already has on disk. This mode can be useful if you want to improve performance or save disk space by operating nodes without a commitlog, or want to force a repair of all data on an individual node. This mode can lead to violations of M3DB's consistency guarantees due to the fact that the commit logs are being ignored. In addition, if you lose a replication factor's worth (or more) of hosts at the same time, the node will not be able to bootstrap unless an operator modifies the bootstrap consistency level configuration in etcd (see the peers bootstrapper section above). Finally, this mode adds additional network and resource pressure on other nodes in the cluster while one node is peer bootstrapping from them, which can be problematic in catastrophic scenarios where all the nodes are trying to stream data from each other.
+Invalid bootstrappers configuration
+For the sake of completeness, we've included a short discussion below of some bootstrapping configurations that we consider "invalid" in that they are likely to lose data / violate M3DB's consistency guarantees and/or not handle placement changes in a correct way.
+filesystem,commitlog,uninitialized_topology
+This bootstrapping configuration will work just fine if nodes are never added/replaced/removed, but will fail when attempting a node add/replace/remove.
+filesystem,uninitialized_topology
+Every time a node is restarted it will utilize the immutable Fileset files it has already written out to disk, but any data that it had received since it wrote out the last set of immutable files will be lost.
+commitlog,uninitialized_topology
+Every time a node is restarted it will read all the commit log and snapshot files it has on disk, but it will ignore all the data in the immutable Fileset files that it has already written.
+Crash Recovery
+NOTE: These steps should not be necessary in most cases, especially if using the default bootstrappers configuration of filesystem,commitlog,peers,uninitialized_topology. However, if the configuration is non-default or the cluster has been down for a prolonged period of time, these steps may be necessary. A good indicator would be log messages related to failing to bootstrap from peers due to consistency issues.
+M3DB may require manual intervention to recover in the event of a prolonged loss of quorum. This is because the Peers Bootstrapper must read from a majority of the nodes owning a shard in order to bootstrap.
+To relax this bootstrapping constraint, the value stored in etcd that corresponds to the m3db.client.bootstrap-consistency-level runtime key must be modified. Until the coordinator supports an API for this, this must be done manually. The M3 contributors are aware of how cumbersome this is and are working on this API.
+To update this value in etcd, first determine the environment the M3DB node is using. For example, in this configuration it is default_env. If using the M3DB Operator, the value will be $KUBE_NAMESPACE/$CLUSTER_NAME, where $KUBE_NAMESPACE is the name of the Kubernetes namespace the cluster is located in and $CLUSTER_NAME is the name you have assigned the cluster (such as default/my-test-cluster).
+The following base64-encoded string represents a Protobuf-serialized message containing the string unstrict_majority: ChF1bnN0cmljdF9tYWpvcml0eQ==.
Decode this string and place it in the following etcd key, where $ENV is the value determined above: +_kv/$ENV/m3db.client.bootstrap-consistency-level + +Note that on MacOS, base64 requires the -D flag to decode, whereas elsewhere it is likely -d. Also note the use of echo -n to ensure removal of newlines if your shell does not support the <<:/api/v1/openapi or our online API documentation. +Additionally, the following headers can be used in the namespace operations: +Cluster-Environment-Name: +This header is used to specify the cluster environment name. If not set, the default default_env is used. +Cluster-Zone-Name: +This header is used to specify the cluster zone name. If not set, the default embedded is used. +Adding a Namespace +Recommended (Easy way) +The recommended way to add a namespace to M3DB is to use our api/v1/database/namespace/create endpoint. This API abstracts over a lot of the complexity of configuring a namespace and requires only two pieces of configuration to be provided: the name of the namespace, as well as its retention. +For example, the following cURL: +curl -X POST :/api/v1/database/namespace/create -d '{ + "namespaceName": "default_unaggregated", + "retentionTime": "24h" +}' + +will create a namespace called default_unaggregated with a retention of 24 hours. All of the other namespace options will either use reasonable default values or be calculated based on the provided retentionTime. +Adding a namespace does not require restarting M3DB, but will require modifying the M3Coordinator configuration to include the new namespace, and then restarting it. +If you feel the need to configure the namespace options yourself (for performance or other reasons), read the Advanced section below. +Advanced (Hard Way) +The "advanced" API allows you to configure every aspect of the namespace that you're adding which can sometimes be helpful for development, debugging, and tuning clusters for maximum performance. Adding a namespace is a simple as using the POST api/v1/namespace API on an M3Coordinator instance. +curl -X POST :/api/v1/namespace -d '{ + "name": "default_unaggregated", + "options": { + "bootstrapEnabled": true, + "flushEnabled": true, + "writesToCommitLog": true, + "cleanupEnabled": true, + "snapshotEnabled": true, + "repairEnabled": false, + "retentionOptions": { + "retentionPeriod": "2d", + "blockSize": "2h", + "bufferFuture": "10m", + "bufferPast": "10m", + "blockDataExpiry": true, + "blockDataExpiryAfterNotAccessedPeriod": "5m" + }, + "indexOptions": { + "enabled": true, + "blockSize": "2h" + } + } +}' + +Adding a namespace does not require restarting M3DB, but will require modifying the M3Coordinator configuration to include the new namespace, and then restarting it. +Deleting a Namespace +Deleting a namespace is a simple as using the DELETE /api/v1/namespace API on an M3Coordinator instance. +curl -X DELETE :/api/v1/namespace/ +Note that deleting a namespace will not have any effect on the M3DB nodes until they are all restarted. In addition, the namespace will need to be removed from the M3Coordinator configuration and then the M3Coordinator node will need to be restarted. +Modifying a Namespace +There is currently no atomic namespace modification endpoint. Instead, you will need to delete a namespace and then add it back again with the same name, but modified settings. Review the individual namespace settings above to determine whether or not a given setting is safe to modify. For example, it is never safe to modify the blockSize of a namespace. 
+Also, be very careful not to restart the M3DB nodes after deleting the namespace, but before adding it back. If you do this, the M3DB nodes may detect the existing data files on disk and delete them since they are not configured to retain that namespace.
+Viewing a Namespace
+In order to view a namespace and its attributes, use the GET /api/v1/namespace API on an M3Coordinator instance. Additionally, for readability/debugging purposes, you can add the debug=true parameter to the URL to view block sizes, buffer sizes, etc. in duration format as opposed to nanoseconds (the default).
+Namespace Attributes
+bootstrapEnabled
+This controls whether M3DB will attempt to bootstrap the namespace on startup. This value should always be set to true unless you have a very good reason to change it, as setting it to false can cause data loss when restarting nodes.
+Can be modified without creating a new namespace: yes
+flushEnabled
+This controls whether M3DB will periodically flush blocks to disk once they become immutable. This value should always be set to true unless you have a very good reason to change it, as setting it to false will cause increased memory utilization and potential data loss when restarting nodes.
+Can be modified without creating a new namespace: yes
+writesToCommitlog
+This controls whether M3DB will include writes to this namespace in the commitlog. This value should always be set to true unless you have a very good reason to change it, as setting it to false will cause potential data loss when restarting nodes.
+Can be modified without creating a new namespace: yes
+snapshotEnabled
+This controls whether M3DB will periodically write out snapshot files for this namespace, which act as compacted commitlog files. This value should always be set to true unless you have a very good reason to change it, as setting it to false will increase bootstrapping times (reading commitlog files is slower than reading snapshot files) and increase disk utilization (snapshot files are compressed but commitlog files are uncompressed).
+Can be modified without creating a new namespace: yes
+repairEnabled
+If enabled, the M3DB nodes will attempt to compare the data they own with the data of their peers and emit metrics about any discrepancies. This feature is experimental and we do not recommend enabling it under any circumstances.
+retentionOptions
+retentionPeriod
+This controls the duration of time that M3DB will retain data for the namespace. For example, if this is set to 30 days, then data within this namespace will be available for querying up to 30 days after it is written. Note that this retention operates at the block level, not the write level, so it's possible for individual datapoints to only be available for less than the specified retention. For example, if the blockSize was set to 24 hours and the retention was set to 30 days, then a write that arrived at the very end of a 24 hour block would only be available for 29 days, but the node itself would always support querying the last 30 days' worth of data.
+Can be modified without creating a new namespace: yes
+blockSize
+This is the most important value to consider when tuning the performance of an M3DB namespace. Read the storage engine documentation for more details, but the basic idea is that larger blockSizes will use more memory but achieve higher compression. Similarly, smaller blockSizes will use less memory but have worse compression. In testing, good compression occurs with block sizes containing around 720 samples per timeseries.
+Can be modified without creating a new namespace: no
+Below are recommendations for block size based on resolution (for example, at 15s resolution, 720 samples per series corresponds to roughly 720 x 15s = 3h):
+
+| Resolution | Block Size |
+|------------|------------|
+| 5s         | 60m        |
+| 15s        | 3h         |
+| 30s        | 6h         |
+| 1m         | 12h        |
+| 5m         | 60h        |
+
+bufferFuture and bufferPast
+These values control how far into the future and the past (compared to the system time on an M3DB node) writes for the namespace will be accepted. For example, consider the following configuration:
+bufferPast: 10m
+bufferFuture: 20m
+currentSystemTime: 2:35:00PM
+
+Now consider the following writes (all of which arrive at 2:35:00PM system time, but include datapoints with the specified timestamps):
+2:25:00PM - Accepted, within the 10m bufferPast
+
+2:24:59PM - Rejected, outside the 10m bufferPast
+
+2:55:00PM - Accepted, within the 20m bufferFuture
+
+2:55:01PM - Rejected, outside the 20m bufferFuture
+
+While it may be tempting to configure bufferPast and bufferFuture to very large values to prevent writes from being rejected, this may cause performance issues. M3DB is a timeseries database that is optimized for realtime data. Out of order writes, as well as writes for times that are very far in the future or past, are much more expensive and will cause additional CPU / memory pressure. In addition, M3DB cannot evict a block from memory until it is no longer mutable, and large bufferPast and bufferFuture values effectively increase the amount of time that a block is mutable for, which means that it must be kept in memory for a longer period of time.
+Can be modified without creating a new namespace: yes
+Index Options
+enabled
+Whether to use the built-in indexing. Must be true.
+Can be modified without creating a new namespace: no
+blockSize
+The size of blocks (in duration) that the index uses. Should match the database's blockSize for optimal memory usage.
+Can be modified without creating a new namespace: no
diff --git a/docs-beta/content/reference_docs/configurations/replication.md b/docs-beta/content/reference_docs/configurations/replication.md
index e987e5e4b3..f57cc1c22b 100644
--- a/docs-beta/content/reference_docs/configurations/replication.md
+++ b/docs-beta/content/reference_docs/configurations/replication.md
@@ -4,3 +4,165 @@ date: 2020-04-21T21:01:57-04:00
draft: true
---
+Sharding
+Timeseries keys are hashed to a fixed set of virtual shards. Virtual shards are then assigned to physical nodes. M3DB can be configured to use any hashing function and a configured number of shards. By default murmur3 is used as the hashing function and 4096 virtual shards are configured.
+Benefits
+Shards provide a variety of benefits throughout the M3DB stack:
+They make horizontal scaling easier and adding / removing nodes without downtime trivial at the cluster level.
+They provide more fine-grained lock granularity at the memory level.
+They inform the filesystem organization in that data belonging to the same shard will be used / dropped together and can be kept in the same file.
+
+
+Replication
+Logical shards are placed per virtual shard per replica with configurable isolation (zone aware, rack aware, etc.). For instance, when using rack aware isolation, the set of datacenter racks that locate a replica's data is distinct from the racks that locate all other replicas' data.
+Replication is synchronous during a write and, depending on the consistency level configured, the client is notified of whether the write succeeded or failed with respect to the consistency level and replication achieved.
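+As a concrete way to see this sharding in practice, the current shard-to-node assignments (and the shard states described in the next section) can be inspected through the coordinator's placement API. The following is a minimal sketch, assuming a coordinator reachable at localhost:7201 and jq installed; the JSON field names reflect the placement response shape and are best verified against your deployment:
+# List each instance in the placement together with its shard IDs and states
+curl -s http://localhost:7201/api/v1/services/m3db/placement \
+  | jq '.placement.instances[] | {instance: .id, shards: [.shards[] | "\(.id):\(.state)"]}'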
+Replica
+Each replica has its own assignment of a single logical shard per virtual shard.
+Conceptually it can be defined as:
+Replica {
+  id     uint32
+  shards []Shard
+}
+
+Shard state
+Each shard can be conceptually defined as:
+Shard {
+  id          uint32
+  assignments []ShardAssignment
+}
+
+ShardAssignment {
+  host  Host
+  state ShardState
+}
+
+enum ShardState {
+  INITIALIZING,
+  AVAILABLE,
+  LEAVING
+}
+
+Shard assignment
+The assignment of shards is stored in etcd. When adding, removing or replacing a node, shard goal states are assigned for each shard that is affected.
+For a write to appear as successful for a given replica, it must succeed against all assigned hosts for that shard. That means that if a given shard has one host assigned as LEAVING and another host assigned as INITIALIZING for a given replica, writes to both of these hosts must appear as successful to return success for a write to that given replica. Currently, however, only AVAILABLE shards count towards consistency; the work to group the LEAVING and INITIALIZING shards together when calculating a write success/error is not complete, see issue 417.
+It is up to the nodes themselves to bootstrap shards when the assignment of new shards to them is discovered in the INITIALIZING state, and to transition the state to AVAILABLE once bootstrapped by calling the cluster management APIs when done. Using a compare-and-set, this atomically removes the LEAVING shard still assigned to the node that previously owned it and transitions the shard state on the new node from INITIALIZING to AVAILABLE.
+Nodes will not start serving reads for the new shard until it is AVAILABLE, meaning not until they have bootstrapped data for those shards.
+
+
+
+Replication and Deployment in Zones
+Overview
+M3DB supports both deploying across multiple zones in a region or deploying to a single zone with rack-level isolation. It can also be deployed across multiple regions for a global view of data, though both latency and bandwidth costs may increase as a result.
+In addition, M3DB has support for automatically replicating data between isolated M3DB clusters (potentially running in different zones / regions). More details can be found in the Replication between clusters operational guide.
+Replication
+A replication factor of at least 3 is highly recommended for any M3DB deployment, due to the consistency levels (for both reads and writes) that require quorum in order to complete an operation. For more information on consistency levels, see the documentation concerning tuning availability, consistency and durability.
+M3DB will do its best to distribute shards evenly among the availability zones while still taking each individual node's weight into account, but if some of the availability zones have fewer available hosts than others, then each host in those zones will be responsible for more shards than hosts in the other zones and will thus be subjected to heavier load.
+Replication Factor Recommendations
+Running with RF=1 or RF=2 is not recommended for any multi-node use cases (testing or production). In the future such topologies may be rejected by M3DB entirely. It is also recommended to only run with an odd number of replicas.
+RF=1 is not recommended as it is impossible to perform a safe upgrade or tolerate any node failures: as soon as one node is down, all writes destined for the shards it owned will fail. If the node's storage is lost (e.g. the disk fails), the data is gone forever.
+RF=2, despite having an extra replica, entails many of the same problems RF=1 does. When M3DB is configured to perform quorum writes and reads (the default), as soon as a single node is down (for planned maintenance or an unplanned disruption) clients will be unable to read or write (as the quorum size of 2 is 2). Even if clients relax their consistency guarantees and read from the remaining serving node, users may experience flapping results depending on whether one node had data for a time window that the other did not.
+Finally, it is only recommended to run with an odd number of replicas. Because the quorum size of an even-RF N is (N/2)+1, any cluster with an even replica factor N has the same failure tolerance as a cluster with RF=N-1. "Failure tolerance" is defined as the number of isolation groups you can concurrently lose nodes across. The following table demonstrates the quorum size and failure tolerance of various RFs, inspired by etcd's failure tolerance documentation.
+
+| Replica Factor | Quorum Size | Failure Tolerance |
+|----------------|-------------|-------------------|
+| 1              | 1           | 0                 |
+| 2              | 2           | 0                 |
+| 3              | 2           | 1                 |
+| 4              | 3           | 1                 |
+| 5              | 3           | 2                 |
+| 6              | 4           | 2                 |
+| 7              | 4           | 3                 |
+
+Upgrading hosts in a deployment
+When an M3DB node is restarted it has to perform a bootstrap process before it can serve reads. During this time the node will continue to accept writes, but will not be available for reads.
+There is also a small window of time between when the process is stopped and when it is started again during which it will be unavailable for writes as well.
+Deployment across multiple availability zones in a region
+For deployment in a region, it is recommended to set the isolationGroup host attribute to the name of the availability zone a host is in.
+In this configuration, shards are distributed among hosts such that each will not be placed more than once in the same availability zone. This allows an entire availability zone to be lost at any given time, as it is guaranteed to only affect one replica of data.
+For example, in a multi-zone deployment with four shards spread over three availability zones:
+
+Typically, deployments have many more than four shards - this is a simple example that illustrates how M3DB maintains availability while losing an availability zone, as two of three replicas are still intact.
+Deployment in a single zone
+For deployment in a single zone, it is recommended to set the isolationGroup host attribute to the name of the rack a host is in or another logical unit that separates groups of hosts in your zone.
+In this configuration, shards are distributed among hosts such that each will not be placed more than once in the same defined rack or logical unit. This allows an entire unit to be lost at any given time, as it is guaranteed to only affect one replica of data.
+For example, in a single-zone deployment with three shards spread over four racks:
+
+Typically, deployments have many more than three shards - this is a simple example that illustrates how M3DB maintains availability while losing a single rack, as two of three replicas are still intact.
+Deployment across multiple regions
+For deployment across regions, it is recommended to set the isolationGroup host attribute to the name of the region a host is in.
+As mentioned previously, latency and bandwidth costs may increase when using clusters that span regions.
+In this configuration, shards are distributed among hosts such that each will not be placed more than once in the same region. This allows an entire region to be lost at any given time, as it is guaranteed to only affect one replica of data.
+For example, in a multi-region deployment with four shards spread over five regions:
+
+Typically, deployments have many more than four shards - this is a simple example that illustrates how M3DB maintains availability while losing up to two regions, as three of five replicas are still intact.
+
+Replication between clusters (beta)
+Overview
+M3DB clusters can be configured to passively replicate data from other clusters. This feature is most commonly used when operators wish to run two (or more) regional clusters that function independently while passively replicating data from the other cluster in an eventually consistent manner.
+The cross-cluster replication feature is built on top of the background repairs feature. As a result, it has all the same caveats and limitations. Specifically, it does not currently work with clusters that use M3DB's indexing feature, and the replication delay between two clusters will be at least (block size + bufferPast) for data written at the beginning of a block for a given namespace. For use-cases where a large replication delay is unacceptable, the current recommendation is to dual-write to both clusters in parallel and then rely upon the cross-cluster replication feature to repair any discrepancies between the clusters caused by failed dual-writes. This recommendation is likely to change in the future once support for low-latency replication is added to M3DB in the form of commitlog tailing.
+While cross-cluster replication is built on top of the background repairs feature, background repairs do not need to be enabled for cross-cluster replication to be enabled. In other words, clusters can be configured such that:
+Background repairs (within a cluster) are disabled and replication is also disabled.
+Background repairs (within a cluster) are enabled, but replication is disabled.
+Background repairs (within a cluster) are disabled, but replication is enabled.
+Background repairs (within a cluster) are enabled and replication is also enabled.
+Configuration
+Important: All M3DB clusters involved in the cross-cluster replication process must be configured such that they have the exact same:
+Number of shards
+Replication factor
+Namespace configuration
+The replication feature can be enabled by adding the following configuration to m3dbnode.yml under the db section:
+db:
+  ... (other configuration)
+  replication:
+    clusters:
+      - name: "some-other-cluster"
+        repairEnabled: true
+        client:
+          config:
+            service:
+              env:
+              zone:
+              service:
+              cacheDir: /var/lib/m3kv
+              etcdClusters:
+                - zone:
+                  endpoints:
+                    - :
+
+Note that the repairEnabled field in the configuration above is independent of the enabled field under the repairs section. For example, the example above will enable replication of data from some-other-cluster but will not perform background repairs within the cluster the M3DB node belongs to.
+However, the following configuration:
+db:
+  ... (other configuration)
+  repair:
+    enabled: true
+
+  replication:
+    clusters:
+      - name: "some-other-cluster"
+        repairEnabled: true
+        client:
+          config:
+            service:
+              env:
+              zone:
+              service:
+              cacheDir: /var/lib/m3kv
+              etcdClusters:
+                - zone:
+                  endpoints:
+                    - :
+
+would enable both replication of data from some-other-cluster as well as background repairs within the cluster that the M3DB node belongs to.
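+Since both clusters must agree on the number of shards and the replication factor, a quick sanity check before enabling replication can be run against each cluster's coordinator. This is a hedged sketch, assuming hypothetical coordinator addresses and jq installed:
+# Compare shard count and replication factor across both clusters
+for coord in coordinator-cluster-a:7201 coordinator-cluster-b:7201; do
+  echo "${coord}:"
+  curl -s "http://${coord}/api/v1/services/m3db/placement" \
+    | jq '{numShards: .placement.numShards, replicaFactor: .placement.replicaFactor}'
+done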
diff --git a/docs-beta/content/reference_docs/configurations/topology_config.md b/docs-beta/content/reference_docs/configurations/topology_config.md
index b84cd8e7fe..df50e9950a 100644
--- a/docs-beta/content/reference_docs/configurations/topology_config.md
+++ b/docs-beta/content/reference_docs/configurations/topology_config.md
@@ -1,6 +1,83 @@
---
-title: "Topology_config"
+title: "Topology and placement"
date: 2020-04-21T21:01:48-04:00
draft: true
---
+Placement
+Overview
+Note: The words placement and topology are used interchangeably throughout the M3DB documentation and codebase.
+An M3DB cluster has exactly one Placement. That placement maps the cluster's shard replicas to nodes. A cluster also has 0 or more namespaces (analogous to tables in other databases), and each node serves every namespace for the shards it owns. In other words, if the cluster topology states that node A owns shards 1, 2, and 3, then node A will own shards 1, 2, and 3 for all configured namespaces in the cluster.
+M3DB stores its placement (the mapping of which nodes are responsible for which shards) in etcd. There are three possible states that each node/shard pair can be in:
+Initializing
+Available
+Leaving
+Note that these states are not a reflection of the current status of an M3DB node, but an indication of whether a given node has ever successfully bootstrapped and taken ownership of a given shard (achieved goal state). For example, in a new cluster all the nodes will begin with all of their shards in the Initializing state. Once all the nodes finish bootstrapping, they will mark all of their shards as Available. If all the M3DB nodes are stopped at the same time, the cluster placement will still show all of the shards for all of the nodes as Available.
+Initializing State
+The Initializing state is the state in which all new node/shard combinations begin. For example, upon creating a new placement all the node/shard pairs will begin in the Initializing state and only once they have successfully bootstrapped will they transition to the Available state.
+The Initializing state is not limited to new placements, however, as it can also occur during placement changes. For example, during a node add/replace the new node will begin with all of its shards in the Initializing state until it can stream the data it is missing from its peers. During a node removal, all of the nodes that receive new shards (as a result of taking over the responsibilities of the node that is leaving) will begin with those shards marked as Initializing until they can stream in the data from the node leaving the cluster, or one of its peers.
+Available State
+Once a node with a shard in the Initializing state successfully bootstraps all of the data for that shard, it will mark that shard as Available (for the single node) in the cluster placement.
+Leaving State
+The Leaving state indicates that a node has been marked for removal from the cluster. The purpose of this state is to allow the node to remain in the cluster long enough for the nodes that are taking over its responsibilities to stream data from it.
+Sample Cluster State Transitions - Node Add
+Node adds are performed by adding the new node to the placement. Some portion of the existing shards will be assigned to the new node based on its weight, and they will begin in the Initializing state. Similarly, the shards will be marked as Leaving on the nodes that are destined to lose ownership of them.
Once the new node finishes bootstrapping the shards, it will update the placement to indicate that the shards it owns are Available and that the Leaving node should no longer own that shard in the placement. +Replication factor: 3 + + ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ + │ Node A │ │ Node B │ │ Node C │ │ Node D │ +┌──────────────────────────┬─────┴─────────────────┴─────┬────┴─────────────────┴────┬───┴─────────────────┴───┬───┴─────────────────┴───┐ +│ │ ┌─────────────────────────┐ │ ┌───────────────────────┐ │ ┌──────────────────────┐│ │ +│ │ │ │ │ │ │ │ │ ││ │ +│ │ │ │ │ │ │ │ │ ││ │ +│ │ │ Shard 1: Available │ │ │ Shard 1: Available │ │ │ Shard 1: Available ││ │ +│ 1) Initial Placement │ │ Shard 2: Available │ │ │ Shard 2: Available │ │ │ Shard 2: Available ││ │ +│ │ │ Shard 3: Available │ │ │ Shard 3: Available │ │ │ Shard 3: Available ││ │ +│ │ │ │ │ │ │ │ │ ││ │ +│ │ │ │ │ │ │ │ │ ││ │ +│ │ └─────────────────────────┘ │ └───────────────────────┘ │ └──────────────────────┘│ │ +├──────────────────────────┼─────────────────────────────┼───────────────────────────┼─────────────────────────┼─────────────────────────┤ +│ │ │ │ │ │ +│ │ ┌─────────────────────────┐ │ ┌───────────────────────┐ │ ┌──────────────────────┐│┌──────────────────────┐ │ +│ │ │ │ │ │ │ │ │ │││ │ │ +│ │ │ │ │ │ │ │ │ │││ │ │ +│ │ │ Shard 1: Leaving │ │ │ Shard 1: Available │ │ │ Shard 1: Available │││Shard 1: Initializing │ │ +│ 2) Begin Node Add │ │ Shard 2: Available │ │ │ Shard 2: Leaving │ │ │ Shard 2: Available │││Shard 2: Initializing │ │ +│ │ │ Shard 3: Available │ │ │ Shard 3: Available │ │ │ Shard 3: Leaving │││Shard 3: Initializing │ │ +│ │ │ │ │ │ │ │ │ │││ │ │ +│ │ │ │ │ │ │ │ │ │││ │ │ +│ │ └─────────────────────────┘ │ └───────────────────────┘ │ └──────────────────────┘│└──────────────────────┘ │ +│ │ │ │ │ │ +├──────────────────────────┼─────────────────────────────┼───────────────────────────┼─────────────────────────┼─────────────────────────┤ +│ │ │ │ │ │ +│ │ ┌─────────────────────────┐ │ ┌───────────────────────┐ │ ┌──────────────────────┐│┌──────────────────────┐ │ +│ │ │ │ │ │ │ │ │ │││ │ │ +│ │ │ │ │ │ │ │ │ │││ │ │ +│ │ │ Shard 2: Available │ │ │ Shard 1: Available │ │ │ Shard 1: Available │││ Shard 1: Available │ │ +│ 3) Complete Node Add │ │ Shard 3: Available │ │ │ Shard 3: Available │ │ │ Shard 2: Available │││ Shard 2: Available │ │ +│ │ │ │ │ │ │ │ │ │││ Shard 3: Available │ │ +│ │ │ │ │ │ │ │ │ │││ │ │ +│ │ │ │ │ │ │ │ │ │││ │ │ +│ │ └─────────────────────────┘ │ └───────────────────────┘ │ └──────────────────────┘│└──────────────────────┘ │ +│ │ │ │ │ │ +└──────────────────────────┴─────────────────────────────┴───────────────────────────┴─────────────────────────┴─────────────────────────┘ + +Overview +M3DB was designed from the ground up to be a distributed (clustered) database that is availability zone or rack aware (by using isolation groups). Clusters will seamlessly scale with your data, and you can start with a small number of nodes and grow it to a size of several hundred nodes with no downtime or expensive migrations. +Before reading the rest of this document, we recommend familiarizing yourself with the M3DB placement documentation +Note: The primary limiting factor for the maximum size of an M3DB cluster is the number of shards. 
Picking an appropriate number of shards is more of an art than a science, but our recommendation is as follows:
+The number of shards that M3DB uses is configurable and there are a couple of key points to note when deciding the number to use. The more nodes you have, the more shards you want because you want the shards to be evenly distributed amongst your nodes. However, because each shard requires more files to be created, you also don't want to have too many shards per node. This is due to the fact that each bit of data needs to be repartitioned and moved around the cluster (i.e. every bit of data needs to be moved all at once). Below are some guidelines depending on how many nodes you will have in your cluster eventually - you will need to decide the number of shards up front; you cannot change this once the cluster is created.
+
+| Number of Nodes | Number of Shards |
+|-----------------|------------------|
+| 3               | 64               |
+| 6               | 128              |
+| 12              | 256              |
+| 24              | 512              |
+| 48              | 1024             |
+| 128+            | 4096             |
diff --git a/docs-beta/content/reference_docs/operator.md b/docs-beta/content/reference_docs/operator.md
index b3c231870d..5d491907a8 100644
--- a/docs-beta/content/reference_docs/operator.md
+++ b/docs-beta/content/reference_docs/operator.md
@@ -1,5 +1,5 @@
---
-title: "Operator"
+title: "M3Operator"
date: 2020-04-21T21:02:41-04:00
draft: true
---
diff --git a/docs-beta/content/troubleshooting/_index.md b/docs-beta/content/troubleshooting/_index.md
index 17d3d650eb..7b41eb7ca5 100644
--- a/docs-beta/content/troubleshooting/_index.md
+++ b/docs-beta/content/troubleshooting/_index.md
@@ -1,9 +1,9 @@
+++
title = "Troubleshooting"
date = 2020-04-01T20:23:10-04:00
-weight = 12
+weight = 8
chapter = true
-pre = "12. "
+pre = "8. "
+++

### Section 12
diff --git a/docs-beta/content/troubleshooting/error_msgs.md b/docs-beta/content/troubleshooting/error_msgs.md
index 658d359888..cbc2449ccd 100644
--- a/docs-beta/content/troubleshooting/error_msgs.md
+++ b/docs-beta/content/troubleshooting/error_msgs.md
@@ -1,6 +1,54 @@
---
-title: "Error_msgs"
+title: "Error Messages"
date: 2020-04-21T21:04:36-04:00
draft: true
---
+Troubleshooting
+Some common problems and resolutions
+Ports 9001-9004 aren't open after starting m3db.
+These ports will not open until a namespace and placement have been created and the nodes have bootstrapped.
+Bootstrapping is slow
+Double check your configuration against the bootstrapping guide. The nodes will log what bootstrapper they are using and what time range they are using it for.
+If you're using the commitlog bootstrapper and it seems to be slow, ensure that snapshotting is enabled for your namespace. Enabling snapshotting will require a node restart to take effect.
+If an M3DB node hasn't been able to snapshot for a while, or is stuck in the commitlog bootstrapping phase for a long time due to accumulating a large number of commitlogs, consider using the peers bootstrapper. In situations where a large number of commitlogs need to be read, the peers bootstrapper will outperform the commitlog bootstrapper (faster and less memory usage) due to the fact that it will receive already-compressed data from its peers. Keep in mind that this will only work with a replication factor of 3 or larger and only if the node's peers are healthy and bootstrapped. Review the bootstrapping guide for more information.
+Nodes are crashing with memory allocation errors, but there's plenty of available memory
+Ensure you've set vm.max_map_count to something like 262,144 using sysctl. Find out more in the Clustering the Hard Way document.
+What to do if my M3DB node is OOM'ing?
+Ensure that you are not co-locating coordinator, etcd or query nodes with your M3DB nodes. Colocation or embedded mode is fine for a development environment, but highly discouraged in production. +Check to make sure you are running adequate block sizes based on the retention of your namespace. See namespace configuration for more information. +Ensure that you use at most 50-60% memory utilization in the normal running state. You want to ensure enough overhead to handle bursts of metrics, especially ones with new IDs as those will take more memory initially. +High cardinality metrics can also lead to OOMs especially if you are not adequately provisioned. If you have many unique timeseries such as ones containing UUIDs or timestamps as tag values, you should consider mitigating their cardinality. +Using the /debug/dump API +The /debug/dump API returns a number of helpful debugging outputs. Currently, we support the following: +CPU profile: determines where a program spends its time while actively consuming CPU cycles (as opposed to while sleeping or waiting for I/O). Currently set to take a 5 second profile. +Heap profile: reports memory allocation samples; used to monitor current and historical memory usage, and to check for memory leaks. +Goroutines profile: reports the stack traces of all current goroutines. +Host profile: returns data about the underlying host such as PID, working directory, etc. +Namespace: returns information about the namespaces setup in M3DB +Placement: returns information about the placement setup in M3DB +This endpoint can be used on both the db nodes as well as the coordinator/query nodes. However, namespace and placement info are only available on the coordinator debug endpoint. +To use this, simply run the following on either the M3DB debug listen port or the regular port on M3Coordinator. +curl :/debug/dump > + +# unzip the file +unzip + +Now, you will have the following files, which you can use for troubleshooting using the below commands: +cpuSource +go tool pprof -http=:16000 cpuSource + +heapSource +go tool pprof -http=:16000 heapSource + +goroutineProfile +less goroutineProfile + +hostSource +less hostSource | jq . + +namespaceSource +less namespaceSource | jq . + +placementSource +less placementSource | jq . diff --git a/docs-beta/content/troubleshooting/faqs.md b/docs-beta/content/troubleshooting/faqs.md index 57c1b372a8..0ddd12626e 100644 --- a/docs-beta/content/troubleshooting/faqs.md +++ b/docs-beta/content/troubleshooting/faqs.md @@ -1,6 +1,37 @@ --- -title: "Faqs" +title: "FAQs" date: 2020-04-21T21:04:28-04:00 draft: true --- +FAQs +Is there a way to disable M3DB embedded etcd and just use an external etcd cluster? Yes, you can definitely do that. It's all just about setting the etcd endpoints in config as etcd hosts instead of M3DB hosts. See these docs for more information. +Is there a client that lets me send metrics to m3coordinator without going through Prometheus? Yes, you can use the Prometheus remote write client. +Why does my dbnode keep OOM’ing? Refer to the troubleshooting guide. +Do you support PromQL? Yes, M3Query and M3Coordinator both support PromQL. +Do you support Graphite? Yes, M3Query and M3Coordinator both support Graphite. +Does M3DB store both data and (tag) metadata on the same node? Yes it stores the data (i.e. the timeseries datapoints) as well as the tags since it has an embedded index. 
Make sure you have IndexEnabled set to true in your namespace configuration.
+How are writes handled and how is the data kept consistent within M3DB? M3 uses quorum/majority consistency to ensure data is written to replicas in a way that can be read back consistently. For example, if you have a replication factor of 3 and you set your write and read consistencies to quorum, then all writes will only succeed if they make it to at least 2 of the 3 replicas, and reads will only succeed if they get results back from at least 2 of the 3 replicas.
+Do I need to restart M3DB if I add a namespace? If you're adding namespaces, the m3dbnode process will pick up the new namespace without a restart.
+Do I need to restart M3DB if I change or delete a namespace? If you're removing or modifying an existing namespace, you'll need to restart the m3dbnode process in order to complete the namespace deletion/modification process. It is recommended to restart one node at a time and wait for a node to be completely bootstrapped before restarting another node.
+How do I set up aggregation in the coordinator? Refer to the Aggregation section of the M3Query how-to guide.
+How do I set up aggregation using a separate aggregation tier? See this WIP documentation.
+Can you delete metrics from M3DB? Not yet, but that functionality is currently being worked on.
+How can you tell if a node is snapshotting? You can check if your nodes are snapshotting by looking at the Background tasks tab in the M3DB Grafana dashboard.
+How do you list all available API endpoints? See the M3DB OpenAPI documentation at https://m3db.io/openapi
+What is the recommended way to upgrade my M3 stack? TBA
+When graphing my Prometheus data in Grafana, I see gaps. How do I resolve this? This is due to M3 having a concept of null datapoints whereas Prometheus does not. To resolve this, change Stacking & Null value to Connected under the Visualization tab of your graph.
+I am receiving the error "could not create etcd watch","error":"etcd watch create timed out after 10s for key: _sd.placement/default_env/m3db" This is due to the fact that M3DB, M3Coordinator, etc. could not connect to the etcd server. Make sure that the endpoints listed in the following config section are correct AND that the correct configuration file is being used.
+etcdClusters:
+  - zone: embedded
+    endpoints:
+      - HOST1_STATIC_IP_ADDRESS:2379
+      - HOST2_STATIC_IP_ADDRESS:2379
+      - HOST3_STATIC_IP_ADDRESS:2379
+
+
+How can I get a heap dump, CPU profile, etc.? See our docs on the /debug/dump API.
+How much memory utilization should I run M3DB at? We recommend not going above 50%.
+What is the recommended hardware to run on? TBA
+What is the recommended way to create a new namespace? Refer to the Namespace configuration guide.
+How can I see the cardinality of my metrics? Currently, the best way is to go to the M3DB Node Details Dashboard and look at the Ticking panel. However, this is not entirely accurate because of the way data is stored in M3DB -- time series are stored inside time-based blocks that you configure. In actuality, the Ticking graph shows you how many unique series there are for the most recent block that has persisted. In the future, we plan to introduce easier ways to determine the number of unique time series.
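+As a quick illustration of the PromQL support mentioned above, the coordinator/query nodes expose Prometheus-compatible read endpoints. The following is a minimal sketch, assuming a coordinator or query node listening on localhost:7201 (adjust the host and port for your deployment):
+# Instant query
+curl -s 'http://localhost:7201/api/v1/query?query=up'
+
+# Range query over the last hour at 1-minute resolution
+START=$(( $(date +%s) - 3600 )); END=$(date +%s)
+curl -s "http://localhost:7201/api/v1/query_range?query=up&start=${START}&end=${END}&step=60"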
diff --git a/docs-beta/content/troubleshooting/file_issue.md b/docs-beta/content/troubleshooting/file_issue.md index 8018842d2c..a4a4fe526b 100644 --- a/docs-beta/content/troubleshooting/file_issue.md +++ b/docs-beta/content/troubleshooting/file_issue.md @@ -1,5 +1,5 @@ --- -title: "File_issue" +title: "File an issue" date: 2020-04-21T21:04:51-04:00 draft: true --- diff --git a/docs-beta/content/troubleshooting/resources.md b/docs-beta/content/troubleshooting/resources.md index 3e1fe12491..159a214422 100644 --- a/docs-beta/content/troubleshooting/resources.md +++ b/docs-beta/content/troubleshooting/resources.md @@ -1,5 +1,5 @@ --- -title: "Resources" +title: "Support resources" date: 2020-04-21T21:04:43-04:00 draft: true --- diff --git a/docs-beta/content/versions/_index.md b/docs-beta/content/versions/_index.md index 2961d7b986..4890c62dfd 100644 --- a/docs-beta/content/versions/_index.md +++ b/docs-beta/content/versions/_index.md @@ -1,13 +1,10 @@ +++ title = "Versions" date = 2020-04-21T21:05:08-04:00 -weight = 5 +weight = 11 chapter = true -pre = "X. " +pre = "11. " +++ -### Chapter X +### Documentation versions -# Some Chapter title - -Lorem Ipsum. \ No newline at end of file