diff --git a/datahub-frontend/README.md b/datahub-frontend/README.md index 616f8aafe35d0a..cdb82b85dc98e3 100644 --- a/datahub-frontend/README.md +++ b/datahub-frontend/README.md @@ -1,4 +1,8 @@ -# DataHub Frontend +--- +title: "datahub-frontend" +--- + +# DataHub Frontend Proxy DataHub frontend is a [Play](https://www.playframework.com/) service written in Java. It is served as a mid-tier between [DataHub GMS](../metadata-service) which is the backend service and [DataHub Web](../datahub-web-react/README.md). diff --git a/datahub-graphql-core/README.md b/datahub-graphql-core/README.md index 710062c2c7407a..d89599b0cc18bc 100644 --- a/datahub-graphql-core/README.md +++ b/datahub-graphql-core/README.md @@ -1,4 +1,6 @@ -Notice: `datahub-graphql-core` is currently in beta, and as such is currently subject to backwards incompatible changes. +--- +title: "datahub-graphql-core" +--- # DataHub GraphQL Core DataHub GraphQL API is a shared lib module containing a GraphQL API on top of the GMS service layer. It exposes a graph-based representation diff --git a/datahub-web-react/README.md b/datahub-web-react/README.md index 86be2f457e7d88..e767c76488eea7 100644 --- a/datahub-web-react/README.md +++ b/datahub-web-react/README.md @@ -1,3 +1,7 @@ +--- +title: "datahub-web-react" +--- + # DataHub React App ## About diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index eff9d6b926ecc9..ab265311733f97 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -46,6 +46,16 @@ module.exports = { "README", // "docs/faq", // hide from sidebar: out of date "docs/features", + { + Architecture: [ + "docs/architecture/architecture", + "docs/components", + "docs/architecture/metadata-ingestion", + "docs/architecture/metadata-serving", + // "docs/what/gma", + // "docs/what/gms", + ], + }, "docs/roadmap", "docs/CONTRIBUTING", "docs/demo", @@ -53,19 +63,10 @@ module.exports = { "releases", ], "Getting Started": [ - // Serves as user guides. 
"docs/quickstart", "docs/cli", - "docs/debugging", "metadata-ingestion/README", - "docs/policies", - ], - Architecture: [ - "docs/architecture/architecture", - "docs/architecture/metadata-ingestion", - //"docs/what/gma", - "docs/architecture/metadata-serving", - //"docs/what/gms", + "docs/debugging", ], "Metadata Ingestion": [ // add a custom label since the default is 'Metadata Ingestion' @@ -163,6 +164,7 @@ module.exports = { ], }, ], + "Usage Guides": ["docs/policies"], "Developer Guides": [ // TODO: the titles of these should not be in question form in the sidebar "docs/developers", @@ -191,34 +193,38 @@ module.exports = { "metadata-ingestion/developing", "docker/airflow/local_airflow", "docs/how/add-custom-data-platform", - ], - Components: [ - "datahub-web-react/README", - "datahub-frontend/README", - "datahub-graphql-core/README", - "metadata-service/README", - // "metadata-jobs/README", - "metadata-jobs/mae-consumer-job/README", - "metadata-jobs/mce-consumer-job/README", - ], - "Advanced Guides": [ - "docs/advanced/no-code-modeling", - "docs/advanced/aspect-versioning", - "docs/advanced/es-7-upgrade", - "docs/advanced/high-cardinality", - "docs/advanced/no-code-upgrade", - "docs/how/migrating-graph-service-implementation", - "docs/advanced/mcp-mcl", - "docs/advanced/field-path-spec-v2", - "docs/advanced/monitoring", "docs/how/add-custom-ingestion-source", - // WIP "docs/advanced/backfilling", - // WIP "docs/advanced/derived-aspects", - // WIP "docs/advanced/entity-hierarchy", - // WIP "docs/advanced/partial-update", - // WIP "docs/advanced/pdl-best-practices", + { + "Module READMEs": [ + "datahub-web-react/README", + "datahub-frontend/README", + "datahub-graphql-core/README", + "metadata-service/README", + // "metadata-jobs/README", + "metadata-jobs/mae-consumer-job/README", + "metadata-jobs/mce-consumer-job/README", + ], + }, + { + Advanced: [ + "docs/advanced/no-code-modeling", + "docs/advanced/aspect-versioning", + "docs/advanced/es-7-upgrade", + 
"docs/advanced/high-cardinality", + "docs/advanced/no-code-upgrade", + "docs/how/migrating-graph-service-implementation", + "docs/advanced/mcp-mcl", + "docs/advanced/field-path-spec-v2", + "docs/advanced/monitoring", + // WIP "docs/advanced/backfilling", + // WIP "docs/advanced/derived-aspects", + // WIP "docs/advanced/entity-hierarchy", + // WIP "docs/advanced/partial-update", + // WIP "docs/advanced/pdl-best-practices", + ], + }, ], - Deployment: [ + "Deployment Guides": [ "docs/how/kafka-config", "docker/README", "docs/deploy/kubernetes", diff --git a/docs/architecture/architecture.md b/docs/architecture/architecture.md index 54e8a94aebb678..55aa7623c9bd70 100644 --- a/docs/architecture/architecture.md +++ b/docs/architecture/architecture.md @@ -1,24 +1,18 @@ +--- +title: "Overview" +--- + # DataHub Architecture Overview -We highly recommend that you read the excellent [metadata architectures blog post] that describes the three generations of metadata architectures, and goes into a -lot of detail around the motivations and evolution of the DataHub architecture in comparison with other data discovery solutions and catalogs. +DataHub is a [3rd generation](https://engineering.linkedin.com/blog/2020/datahub-popular-metadata-architectures-explained) Metadata Platform that enables Data Discovery, Collaboration, Governance, and end-to-end Observability +that is built for the Modern Data Stack. DataHub employs a model-first philosophy, with a focus on unlocking interoperability between +disparate tools & systems. -The figure below describes the high-level architecture of DataHub, a third-generation metadata platform. +The figure below describes the high-level architecture of DataHub. ![datahub-architecture](../imgs/datahub-architecture.png) -## The Components -The DataHub deployables are split into three components: - -### Ingestion -This component controls how metadata is integrated with DataHub. Read [datahub-ingestion] to learn more. 
- -### Serving -The component is responsible for storing and querying data within DataHub. Read [datahub-serving] to learn more. - -### Frontend -This is the user-facing application that powers search and discovery over the metadata graph. Read [react-frontend] to learn more. - +For a more detailed look at the components that make up the Architecture, check out [Components](../components.md). ## Architecture Highlights There are three main highlights of DataHub's architecture. diff --git a/docs/architecture/metadata-ingestion.md b/docs/architecture/metadata-ingestion.md index 9c2b58a74c2078..17177b083d0618 100644 --- a/docs/architecture/metadata-ingestion.md +++ b/docs/architecture/metadata-ingestion.md @@ -1,3 +1,7 @@ +--- +title: "Ingestion Framework" +--- + # Metadata Ingestion Architecture DataHub supports an extremely flexible ingestion architecture that can support push, pull, asynchronous and synchronous models. diff --git a/docs/architecture/metadata-serving.md b/docs/architecture/metadata-serving.md index 8fae13d5394d6f..f26e6d17383ff1 100644 --- a/docs/architecture/metadata-serving.md +++ b/docs/architecture/metadata-serving.md @@ -1,3 +1,7 @@ +--- +title: "Serving Tier" +--- + # DataHub Serving Architecture The figure below shows the high-level system diagram for DataHub's Serving Tier. diff --git a/docs/components.md b/docs/components.md new file mode 100644 index 00000000000000..ef76729bb37fbf --- /dev/null +++ b/docs/components.md @@ -0,0 +1,60 @@ +--- +title: "Components" +--- + +# DataHub Components Overview + +The DataHub platform consists of the components shown in the following diagram. + +![DataHub Component Overview](./imgs/datahub-components.png) + +## Metadata Store + +The Metadata Store is responsible for storing the [Entities & Aspects](https://datahubproject.io/docs/metadata-modeling/metadata-model/) comprising the Metadata Graph. 
This includes +exposing an API for [ingesting metadata](https://datahubproject.io/docs/metadata-service#ingesting-entities), [fetching Metadata by primary key](https://datahubproject.io/docs/metadata-service#retrieving-entities), [searching entities](https://datahubproject.io/docs/metadata-service#search-an-entity), and [fetching Relationships](https://datahubproject.io/docs/metadata-service#get-relationships-edges) between +entities. It consists of a Spring Java Service hosting a set of [Rest.li](https://linkedin.github.io/rest.li/) API endpoints, along with +MySQL, Elasticsearch, & Kafka for primary storage & indexing. + +Get started with the Metadata Store by following the [Quickstart Guide](https://datahubproject.io/docs/quickstart/). + +## Metadata Models + +Metadata Models are schemas defining the shape of the Entities & Aspects comprising the Metadata Graph, along with the relationships between them. They are defined +using [PDL](https://linkedin.github.io/rest.li/pdl_schema), a modeling language quite similar in form to Protobuf while serializing to JSON. Entities represent a specific class of Metadata +Asset such as a Dataset, a Dashboard, a Data Pipeline, and beyond. Each *instance* of an Entity is identified by a unique identifier called an `urn`. Aspects represent related bundles of data attached +to an instance of an Entity such as its descriptions, tags, and more. View the current set of Entities supported [here](https://datahubproject.io/docs/metadata-modeling/metadata-model#exploring-datahubs-metadata-model). + +Learn more about how DataHub models Metadata [here](https://datahubproject.io/docs/metadata-modeling/metadata-model/). + +## Ingestion Framework + +The Ingestion Framework is a modular, extensible Python library for extracting Metadata from external source systems (e.g.
+Snowflake, Looker, MySQL, Kafka), transforming it into DataHub's [Metadata Model](https://datahubproject.io/docs/metadata-modeling/metadata-model/), and writing it into DataHub via +either Kafka or using the Metadata Store Rest APIs directly. DataHub supports an [extensive list of Source connectors](https://datahubproject.io/docs/metadata-ingestion/#installing-plugins) to choose from, along with +a host of capabilities including schema extraction, table & column profiling, usage information extraction, and more. + +Getting started with the Ingestion Framework is simple: just define a YAML file and execute the `datahub ingest` command. +Learn more by heading over to the [Metadata Ingestion](https://datahubproject.io/docs/metadata-ingestion/) guide. + +## GraphQL API + +The [GraphQL](https://graphql.org/) API provides a strongly-typed, entity-oriented API that makes interacting with the Entities comprising the Metadata +Graph simple, including APIs for adding and removing tags, owners, links & more to Metadata Entities! Most notably, this API is consumed by the User Interface (discussed below) for enabling Search & Discovery, Governance, Observability +and more. + +To get started using the GraphQL API, check out the [Getting Started with GraphQL](https://datahubproject.io/docs/api/graphql/getting-started) guide. + +## User Interface + +DataHub comes with a React UI including an ever-evolving set of features to make Discovering, Governing, & Debugging your Data Assets easy & delightful. +For a full overview of the capabilities currently supported, take a look at the [Features](https://datahubproject.io/docs/features/) overview. For a look at what's coming next, +head over to the [Roadmap](https://datahubproject.io/docs/roadmap/). + +## Learn More + +Learn more about the specifics of the [DataHub Architecture](./architecture/architecture.md) in the Architecture Overview. Learn about using & developing the components +of the Platform by visiting the Module READMEs.
+ +## Feedback / Questions / Concerns + +We want to hear from you! For any inquiries, including Feedback, Questions, or Concerns, reach out on [Slack](https://datahubspace.slack.com/join/shared_invite/zt-nx7i0dj7-I3IJYC551vpnvvjIaNRRGw#/shared-invite/email)! diff --git a/docs/developers.md b/docs/developers.md index 4796aae8711c13..575d0ee3f7c642 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -1,3 +1,7 @@ +--- +title: "Local Development" +--- + # DataHub Developer's Guide ## Building the Project diff --git a/docs/how/add-custom-ingestion-source.md b/docs/how/add-custom-ingestion-source.md index 806af937b4bb9a..73e058c1e47929 100644 --- a/docs/how/add-custom-ingestion-source.md +++ b/docs/how/add-custom-ingestion-source.md @@ -1,3 +1,8 @@ +--- +title: "Using a Custom Ingestion Source" +--- + + # How to use a custom ingestion source without forking Datahub? Adding a custom ingestion source is the easiest way to extend Datahubs ingestion framework to support source systems diff --git a/docs/imgs/datahub-components.png b/docs/imgs/datahub-components.png new file mode 100644 index 00000000000000..8b7d0e5330275a Binary files /dev/null and b/docs/imgs/datahub-components.png differ diff --git a/metadata-jobs/mae-consumer-job/README.md b/metadata-jobs/mae-consumer-job/README.md index 5a5c920b606dd5..3fbc3ff5bc544c 100644 --- a/metadata-jobs/mae-consumer-job/README.md +++ b/metadata-jobs/mae-consumer-job/README.md @@ -1,3 +1,7 @@ +--- +title: "metadata-jobs:mae-consumer-job" +--- + # MetadataAuditEvent (MAE) Consumer Job MAE Consumer is a [Kafka Streams](https://kafka.apache.org/documentation/streams/) job. 
Its main function is to listen `MetadataAuditEvent` Kafka topic for messages and process those messages, converting changes in the metadata model into updates diff --git a/metadata-jobs/mce-consumer-job/README.md b/metadata-jobs/mce-consumer-job/README.md index 0aefe70d75bad2..ab622d73b7b229 100644 --- a/metadata-jobs/mce-consumer-job/README.md +++ b/metadata-jobs/mce-consumer-job/README.md @@ -1,3 +1,7 @@ +--- +title: "metadata-jobs:mce-consumer-job" +--- + # MetadataChangeEvent (MCE) Consumer Job MCE Consumer is a [Kafka Streams](https://kafka.apache.org/documentation/streams/) job. Its main function is to listen `MetadataChangeEvent` Kafka topic for messages and process those messages and writes new metadata to `DataHub GMS`. diff --git a/metadata-service/README.md b/metadata-service/README.md index a2d52369fc249b..135f683df3f549 100644 --- a/metadata-service/README.md +++ b/metadata-service/README.md @@ -1,3 +1,7 @@ +--- +title: "metadata-service" +--- + # DataHub Metadata Service (Also known as GMS) DataHub Metadata Service is a service written in Java consisting of multiple servlets: