fix(browse): Fixing browse path to remove requirement for simple name…

… suffix (#5634)
datahub-project · Sep 7, 2022 · d15518f · d15518f
1 parent 5a07745
commit d15518f
Show file tree

Hide file tree

Showing 19 changed files with 1,096 additions and 100 deletions.
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/DataMigrationStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/DataMigrationStep.java
@@ -17,7 +17,6 @@
 import com.linkedin.metadata.entity.ebean.EbeanAspectV1;
 import com.linkedin.metadata.entity.ebean.EbeanAspectV2;
 import com.linkedin.metadata.models.EntitySpec;
-import com.linkedin.metadata.search.utils.BrowsePathUtils;
 import io.ebean.EbeanServer;
 import io.ebean.PagedList;
 import java.net.URISyntaxException;
@@ -151,7 +150,7 @@ public Function<UpgradeContext, UpgradeStepResult> executable() {
             // Emit a browse path aspect.
             final BrowsePaths browsePaths;
             try {
-              browsePaths = BrowsePathUtils.buildBrowsePath(urn, _entityService.getEntityRegistry());
+              browsePaths = _entityService.buildDefaultBrowsePath(urn);
 
               final AuditStamp browsePathsStamp = new AuditStamp();
               browsePathsStamp.setActor(Urn.createFromString(Constants.SYSTEM_ACTOR));

diff --git a/docker/datahub-gms/env/docker-without-neo4j.env b/docker/datahub-gms/env/docker-without-neo4j.env
@@ -43,3 +43,7 @@ ENTITY_SERVICE_ENABLE_RETENTION=true
 # set ELASTICSEARCH_USE_SSL=true and uncomment:
 # ELASTICSEARCH_USERNAME=
 # ELASTICSEARCH_PASSWORD=
+
+# Uncomment to run a one-time upgrade to migrate legacy default browse path format to latest format
+# More details can be found at https://datahubproject.io/docs/advanced/browse-paths-upgrade
+# UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED=true
diff --git a/docker/datahub-gms/env/docker.env b/docker/datahub-gms/env/docker.env
@@ -72,3 +72,7 @@ UI_INGESTION_DEFAULT_CLI_VERSION=0.8.42
 
 # Uncomment to increase concurrency across Kafka consumers
 # KAFKA_LISTENER_CONCURRENCY=2
+
+# Uncomment to run a one-time upgrade to migrate legacy default browse path format to latest format
+# More details can be found at https://datahubproject.io/docs/advanced/browse-paths-upgrade
+# UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED=true
diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js
@@ -433,6 +433,7 @@ module.exports = {
           "metadata-ingestion/adding-source",
           "docs/how/add-custom-ingestion-source",
           "docs/how/add-custom-data-platform",
+          "docs/advanced/browse-paths-upgrade",
         ],
       },
     ],

diff --git a/docs/advanced/browse-paths-upgrade.md b/docs/advanced/browse-paths-upgrade.md
@@ -0,0 +1,137 @@
+# Browse Paths Upgrade (August 2022)
+
+## Background
+
+Up to this point, there's been a historical constraint on all entity browse paths. Namely, each browse path has been
+required to end with a path component that represents "simple name" for an entity. For example, a Browse Path for a 
+Snowflake Table called "test_table" may look something like this:
+
+```
+/prod/snowflake/warehouse1/db1/test_table
+```
+
+In the UI, we artificially truncate the final path component when you are browsing the Entity hierarchy, so your browse experience 
+would be: 
+
+`prod` > `snowflake` > `warehouse1`> `db1` > `Click Entity`
+
+As you can see, the final path component `test_table` is effectively ignored. It could have any value, and we would still ignore
+it in the UI. This behavior serves as a workaround to the historical requirement that all browse paths end with a simple name. 
+
+This data constraint stands in opposition the original intention of Browse Paths: to provide a simple mechanism for organizing
+assets into a hierarchical folder structure. For this reason, we've changed the semantics of Browse Paths to better align with the original intention. 
+Going forward, you will not be required to provide a final component detailing the "name". Instead, you will be able to provide a simpler path that
+omits this final component:
+
+```
+/prod/snowflake/warehouse1/db1
+```
+
+and the browse experience from the UI will continue to work as you would expect: 
+
+`prod` > `snowflake` > `warehouse1`> `db1` > `Click Entity`. 
+
+With this change comes a fix to a longstanding bug where multiple browse paths could not be attached to a single URN. Going forward,
+we will support producing multiple browse paths for the same entity, and allow you to traverse via multiple paths. For example
+
+```python
+browse_path = BrowsePathsClass(
+    paths=["/powerbi/my/custom/path", "/my/other/custom/path"]
+)
+return MetadataChangeProposalWrapper(
+    entityType="dataset",
+    changeType="UPSERT",
+    entityUrn="urn:li:dataset:(urn:li:dataPlatform:custom,MyFileName,PROD),
+    aspectName="browsePaths",
+    aspect=browse_path,
+)
+```
+*Using the Python Emitter SDK to produce multiple Browse Paths for the same entity*
+
+We've received multiple bug reports, such as [this issue](https://github.com/datahub-project/datahub/issues/5525), and requests to address these issues with Browse, and thus are deciding
+to do it now before more workarounds are created.  
+
+## What this means for you
+
+Once you upgrade to DataHub `v0.8.45` you will immediately notice that traversing your Browse Path hierarchy will require
+one extra click to find the entity. This is because we are correctly displaying the FULL browse path, including the simple name mentioned above.
+
+There will be 2 ways to upgrade to the new browse path format. Depending on your ingestion sources, you may want to use one or both:
+
+1. Migrate default browse paths to the new format by restarting DataHub
+2. Upgrade your version of the `datahub` CLI to push new browse path format (version `v0.8.45`)
+
+Each step will be discussed in detail below. 
+
+### 1. Migrating default browse paths to the new format
+
+To migrate those Browse Paths that are generated by DataHub by default (when no path is provided), simply restart the `datahub-gms` container / pod with a single
+additional environment variable:
+
+```
+UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED=true
+```
+
+And restart the `datahub-gms` instance. This will cause GMS to perform a boot-time migration of all your existing Browse Paths
+to the new format, removing the unnecessarily name component at the very end.
+
+If the migration is successful, you'll see the following in your GMS logs: 
+
+```
+18:58:17.414 [main] INFO c.l.m.b.s.UpgradeDefaultBrowsePathsStep:60 - Successfully upgraded all browse paths!
+```
+
+After this one-time migration is complete, you should be able to navigate the Browse hierarchy exactly as you did previously. 
+
+> Note that certain ingestion sources actively produce their own Browse Paths, which overrides the default path
+> computed by DataHub. 
+> 
+> In these cases, getting the updated Browse Path will require re-running your ingestion process with the updated
+> version of the connector. This is discussed in more detail in the next section. 
+
+### 2. Upgrading the `datahub` CLI to push new browse paths 
+
+If you are actively ingesting metadata from one or more of following sources
+
+1. Sagemaker
+2. Looker / LookML
+3. Feast
+4. Kafka
+5. Mode
+6. PowerBi
+7. Pulsar
+8. Tableau
+9. Business Glossary
+
+You will need to upgrade the DataHub CLI to >= `v0.8.45` and re-run metadata ingestion. This will generate the new browse path format
+and overwrite the existing paths for entities that were extracted from these sources. 
+
+### If you are producing custom Browse Paths
+
+If you've decided to produce your own custom Browse Paths to organize your assets (e.g. via the Python Emitter SDK), you'll want to change the code to produce those paths
+to truncate the final path component. For example, if you were previously emitting a browse path like this:
+
+```
+"my/custom/browse/path/suffix"
+```
+
+You can simply remove the final "suffix" piece:
+
+```
+"my/custom/browse/path"
+```
+
+Your users will be able to find the entity by traversing through these folders in the UI:
+
+`my` > `custom` > `browse`> `path` > `Click Entity`.
+
+
+> Note that if you are using the Browse Path Transformer you *will* be impacted in the same way. It is recommended that you revisit the
+> paths that you are producing, and update them to the new format. 
+
+## Support
+
+The Acryl team will be on standby to assist you in your migration. Please
+join [#release-0_8_0](https://datahubspace.slack.com/archives/C0244FHMHJQ) channel and reach out to us if you find
+trouble with the upgrade or have feedback on the process. We will work closely to make sure you can continue to operate
+DataHub smoothly.
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
@@ -6,6 +6,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 
 ### Breaking Changes
 
+- Browse Paths have been upgraded to a new format to align more closely with the intention of the feature. 
+  Learn more about the changes, including steps on upgrading, here: https://datahubproject.io/docs/advanced/browse-paths-upgrade
+
 ### Potential Downtime
 
 ### Deprecations

diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityService.java
@@ -19,7 +19,9 @@
 import com.linkedin.data.schema.TyperefDataSchema;
 import com.linkedin.data.template.DataTemplateUtil;
 import com.linkedin.data.template.RecordTemplate;
+import com.linkedin.data.template.StringArray;
 import com.linkedin.data.template.UnionTemplate;
+import com.linkedin.dataplatform.DataPlatformInfo;
 import com.linkedin.entity.AspectType;
 import com.linkedin.entity.Entity;
 import com.linkedin.entity.EntityResponse;
@@ -38,7 +40,6 @@
 import com.linkedin.metadata.models.registry.EntityRegistry;
 import com.linkedin.metadata.query.ListUrnsResult;
 import com.linkedin.metadata.run.AspectRowSummary;
-import com.linkedin.metadata.search.utils.BrowsePathUtils;
 import com.linkedin.metadata.snapshot.Snapshot;
 import com.linkedin.metadata.utils.DataPlatformInstanceUtils;
 import com.linkedin.metadata.utils.EntityKeyUtils;
@@ -73,6 +74,7 @@
 import lombok.extern.slf4j.Slf4j;
 
 import static com.linkedin.metadata.Constants.*;
+import static com.linkedin.metadata.search.utils.BrowsePathUtils.*;
 import static com.linkedin.metadata.utils.PegasusUtils.*;
 
 
@@ -142,7 +144,6 @@ public static class IngestProposalResult {
   public static final String BROWSE_PATHS = "browsePaths";
   public static final String DATA_PLATFORM_INSTANCE = "dataPlatformInstance";
   protected static final int MAX_KEYS_PER_QUERY = 500;
-  public static final String STATUS = "status";
 
   public EntityService(
       @Nonnull final AspectDao aspectDao,
@@ -1180,10 +1181,8 @@ public List<Pair<String, RecordTemplate>> generateDefaultAspectsIfMissing(@Nonnu
 
     if (shouldCheckBrowsePath && latestAspects.get(BROWSE_PATHS) == null) {
       try {
-        BrowsePaths generatedBrowsePath = BrowsePathUtils.buildBrowsePath(urn, getEntityRegistry());
-        if (generatedBrowsePath != null) {
-          aspects.add(Pair.of(BROWSE_PATHS, generatedBrowsePath));
-        }
+        BrowsePaths generatedBrowsePath = buildDefaultBrowsePath(urn);
+        aspects.add(Pair.of(BROWSE_PATHS, generatedBrowsePath));
       } catch (URISyntaxException e) {
         log.error("Failed to parse urn: {}", urn);
       }
@@ -1762,4 +1761,54 @@ private RecordTemplate updateAspect(
 
     return newValue;
   }
+
+  /**
+   * Builds the default browse path aspects for a subset of well-supported entities.
+   *
+   * This method currently supports datasets, charts, dashboards, data flows, data jobs, and glossary terms.
+   */
+  @Nonnull
+  public BrowsePaths buildDefaultBrowsePath(final @Nonnull Urn urn) throws URISyntaxException {
+    Character dataPlatformDelimiter = getDataPlatformDelimiter(urn);
+    String defaultBrowsePath = getDefaultBrowsePath(urn, this.getEntityRegistry(), dataPlatformDelimiter);
+    StringArray browsePaths = new StringArray();
+    browsePaths.add(defaultBrowsePath);
+    BrowsePaths browsePathAspect = new BrowsePaths();
+    browsePathAspect.setPaths(browsePaths);
+    return browsePathAspect;
+  }
+
+  /**
+   * Returns a delimiter on which the name of an asset may be split.
+   */
+  private Character getDataPlatformDelimiter(Urn urn) {
+    // Attempt to construct the appropriate Data Platform URN
+    Urn dataPlatformUrn = buildDataPlatformUrn(urn, this.getEntityRegistry());
+    if (dataPlatformUrn != null) {
+      // Attempt to resolve the delimiter from Data Platform Info
+      DataPlatformInfo dataPlatformInfo = getDataPlatformInfo(dataPlatformUrn);
+      if (dataPlatformInfo != null && dataPlatformInfo.hasDatasetNameDelimiter()) {
+        return dataPlatformInfo.getDatasetNameDelimiter().charAt(0);
+      }
+    }
+    // Else, fallback to a default delimiter (period) if one cannot be resolved.
+    return '.';
+  }
+
+  @Nullable
+  private DataPlatformInfo getDataPlatformInfo(Urn urn) {
+    try {
+      final EntityResponse entityResponse = getEntityV2(
+          Constants.DATA_PLATFORM_ENTITY_NAME,
+          urn,
+          ImmutableSet.of(Constants.DATA_PLATFORM_INFO_ASPECT_NAME)
+      );
+      if (entityResponse != null && entityResponse.hasAspects() && entityResponse.getAspects().containsKey(Constants.DATA_PLATFORM_INFO_ASPECT_NAME)) {
+        return new DataPlatformInfo(entityResponse.getAspects().get(Constants.DATA_PLATFORM_INFO_ASPECT_NAME).getValue().data());
+      }
+    } catch (Exception e) {
+      log.warn(String.format("Failed to find Data Platform Info for urn %s", urn));
+    }
+    return null;
+  }
 }
diff --git a/...ata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/...ata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java
@@ -127,12 +127,12 @@ public Map<String, Long> aggregateByValue(@Nullable String entityName, @Nonnull
 
   @Nonnull
   @Override
-  public BrowseResult browse(@Nonnull String entityName, @Nonnull String path, @Nullable Filter requestParams, int from,
+  public BrowseResult browse(@Nonnull String entityName, @Nonnull String path, @Nullable Filter filters, int from,
       int size) {
     log.debug(
-        String.format("Browsing entities entityName: %s, path: %s, requestParams: %s, from: %s, size: %s", entityName,
-            path, requestParams, from, size));
-    return esBrowseDAO.browse(entityName, path, requestParams, from, size);
+        String.format("Browsing entities entityName: %s, path: %s, filters: %s, from: %s, size: %s", entityName,
+            path, filters, from, size));
+    return esBrowseDAO.browse(entityName, path, filters, from, size);
   }
 
   @Nonnull