From a4d4ff63aa77646839b7bfdd81f0290fcc508c5e Mon Sep 17 00:00:00 2001 From: David Leifker Date: Thu, 29 Dec 2022 16:46:36 -0600 Subject: [PATCH 01/12] quickstart updates --- docker/build.gradle | 1 + .../datahub-gms/env/docker-without-neo4j.env | 1 - docker/datahub-gms/env/docker.env | 1 - .../env/docker-without-neo4j.env | 47 +++++++++++++++++++ docker/datahub-mce-consumer/env/docker.env | 12 ++++- docker/docker-compose-with-cassandra.yml | 3 -- docker/docker-compose-without-neo4j.yml | 3 -- ...docker-compose.consumers-without-neo4j.yml | 2 +- docker/docker-compose.yml | 3 -- docker/kafka-setup/kafka-setup.sh | 6 +-- ...er-compose-without-neo4j-m1.quickstart.yml | 1 - ...ocker-compose-without-neo4j.quickstart.yml | 22 ++++++++- ...ose.consumers-without-neo4j.quickstart.yml | 8 ++-- .../docker-compose.consumers.quickstart.yml | 14 +++--- .../quickstart/docker-compose.quickstart.yml | 22 ++++++++- .../src/datahub/cli/docker_check.py | 2 + 16 files changed, 118 insertions(+), 30 deletions(-) create mode 100644 docker/datahub-mce-consumer/env/docker-without-neo4j.env diff --git a/docker/build.gradle b/docker/build.gradle index 6e9ced11a05687..d9670ba55b075e 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -9,6 +9,7 @@ ext { ':docker:elasticsearch-setup', ':docker:mysql-setup', ':docker:kafka-setup', + ':datahub-upgrade', ':metadata-jobs:mce-consumer-job', ':metadata-jobs:mae-consumer-job', ':metadata-service:war', diff --git a/docker/datahub-gms/env/docker-without-neo4j.env b/docker/datahub-gms/env/docker-without-neo4j.env index 2b8d2d5f62f9e4..e1917f3bbd2b94 100644 --- a/docker/datahub-gms/env/docker-without-neo4j.env +++ b/docker/datahub-gms/env/docker-without-neo4j.env @@ -8,7 +8,6 @@ KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 ELASTICSEARCH_HOST=elasticsearch ELASTICSEARCH_PORT=9200 ES_BULK_REFRESH_POLICY=WAIT_UNTIL -ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true GRAPH_SERVICE_DIFF_MODE_ENABLED=true GRAPH_SERVICE_IMPL=elasticsearch 
JAVA_OPTS=-Xms1g -Xmx1g diff --git a/docker/datahub-gms/env/docker.env b/docker/datahub-gms/env/docker.env index ee945c9c6db455..411544c6cae69f 100644 --- a/docker/datahub-gms/env/docker.env +++ b/docker/datahub-gms/env/docker.env @@ -14,7 +14,6 @@ NEO4J_USERNAME=neo4j NEO4J_PASSWORD=datahub JAVA_OPTS=-Xms1g -Xmx1g ES_BULK_REFRESH_POLICY=WAIT_UNTIL -ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true GRAPH_SERVICE_DIFF_MODE_ENABLED=true GRAPH_SERVICE_IMPL=neo4j ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml diff --git a/docker/datahub-mce-consumer/env/docker-without-neo4j.env b/docker/datahub-mce-consumer/env/docker-without-neo4j.env new file mode 100644 index 00000000000000..e70e153a740d26 --- /dev/null +++ b/docker/datahub-mce-consumer/env/docker-without-neo4j.env @@ -0,0 +1,47 @@ +MCE_CONSUMER_ENABLED=true +EBEAN_DATASOURCE_USERNAME=datahub +EBEAN_DATASOURCE_PASSWORD=datahub +EBEAN_DATASOURCE_HOST=mysql:3306 +EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 +EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver +KAFKA_BOOTSTRAP_SERVER=broker:29092 +KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 +ELASTICSEARCH_HOST=elasticsearch +ELASTICSEARCH_PORT=9200 +ES_BULK_REFRESH_POLICY=WAIT_UNTIL +GRAPH_SERVICE_DIFF_MODE_ENABLED=true +GRAPH_SERVICE_IMPL=elasticsearch +JAVA_OPTS=-Xms1g -Xmx1g +ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml +DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} +DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} +DATAHUB_SYSTEM_CLIENT_ID=__datahub_system +DATAHUB_SYSTEM_CLIENT_SECRET=JohnSnowKnowsNothing +ENTITY_SERVICE_ENABLE_RETENTION=true +MAE_CONSUMER_ENABLED=false +PE_CONSUMER_ENABLED=false +UI_INGESTION_ENABLED=false + +# Uncomment to configure kafka topic names +# Make sure these names are consistent across the whole deployment +# 
METADATA_CHANGE_PROPOSAL_TOPIC_NAME=MetadataChangeProposal_v1 +# FAILED_METADATA_CHANGE_PROPOSAL_TOPIC_NAME=FailedMetadataChangeProposal_v1 +# Deprecated! +# METADATA_CHANGE_EVENT_NAME=MetadataChangeEvent_v4 +# FAILED_METADATA_CHANGE_EVENT_NAME=FailedMetadataChangeEvent_v4 + +# Uncomment and set these to support SSL connection to Elasticsearch +# ELASTICSEARCH_USE_SSL=true +# ELASTICSEARCH_SSL_PROTOCOL=TLSv1.2 +# ELASTICSEARCH_SSL_SECURE_RANDOM_IMPL= +# ELASTICSEARCH_SSL_TRUSTSTORE_FILE= +# ELASTICSEARCH_SSL_TRUSTSTORE_TYPE= +# ELASTICSEARCH_SSL_TRUSTSTORE_PASSWORD= +# ELASTICSEARCH_SSL_KEYSTORE_FILE= +# ELASTICSEARCH_SSL_KEYSTORE_TYPE= +# ELASTICSEARCH_SSL_KEYSTORE_PASSWORD= + +# To use simple username/password authentication to Elasticsearch over HTTPS +# set ELASTICSEARCH_USE_SSL=true and uncomment: +# ELASTICSEARCH_USERNAME= +# ELASTICSEARCH_PASSWORD= diff --git a/docker/datahub-mce-consumer/env/docker.env b/docker/datahub-mce-consumer/env/docker.env index f11ce88f72cdda..311bc1636df3ba 100644 --- a/docker/datahub-mce-consumer/env/docker.env +++ b/docker/datahub-mce-consumer/env/docker.env @@ -10,11 +10,21 @@ ELASTICSEARCH_HOST=elasticsearch ELASTICSEARCH_PORT=9200 ES_BULK_REFRESH_POLICY=WAIT_UNTIL GRAPH_SERVICE_DIFF_MODE_ENABLED=true -GRAPH_SERVICE_IMPL=elasticsearch +GRAPH_SERVICE_IMPL=neo4j +NEO4J_HOST=http://neo4j:7474 +NEO4J_URI=bolt://neo4j +NEO4J_USERNAME=neo4j +NEO4J_PASSWORD=datahub JAVA_OPTS=-Xms1g -Xmx1g ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml DATAHUB_SYSTEM_CLIENT_ID=__datahub_system DATAHUB_SYSTEM_CLIENT_SECRET=JohnSnowKnowsNothing +DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} +DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} +ENTITY_SERVICE_ENABLE_RETENTION=true +MAE_CONSUMER_ENABLED=false +PE_CONSUMER_ENABLED=false +UI_INGESTION_ENABLED=false # Uncomment to configure kafka topic names # Make sure these names are consistent across the whole deployment diff --git 
a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 777eb1df945fb8..2d5f3c7bec34b3 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -161,9 +161,6 @@ services: hostname: datahub-upgrade container_name: datahub-upgrade command: [ "-u", "BuildIndices" ] - depends_on: - - elasticsearch-setup - - kafka-setup networks: default: diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 82705228bd2a05..a525b890cf1b0f 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -124,9 +124,6 @@ services: hostname: datahub-upgrade container_name: datahub-upgrade command: ["-u", "BuildIndices"] - depends_on: - - elasticsearch-setup - - kafka-setup networks: default: diff --git a/docker/docker-compose.consumers-without-neo4j.yml b/docker/docker-compose.consumers-without-neo4j.yml index d80beae8bfad35..ffe29ab2a59aa7 100644 --- a/docker/docker-compose.consumers-without-neo4j.yml +++ b/docker/docker-compose.consumers-without-neo4j.yml @@ -22,7 +22,7 @@ services: context: ../ dockerfile: docker/datahub-mce-consumer/Dockerfile image: ${DATAHUB_MCE_CONSUMER_IMAGE:-linkedin/datahub-mce-consumer}:${DATAHUB_VERSION:-head} - env_file: datahub-mce-consumer/env/docker.env + env_file: datahub-mce-consumer/env/docker-without-neo4j.env hostname: datahub-mce-consumer container_name: datahub-mce-consumer ports: diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 530d85166b15ba..56b7d0b7aa591f 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -136,9 +136,6 @@ services: hostname: datahub-upgrade container_name: datahub-upgrade command: ["-u", "BuildIndices"] - depends_on: - - elasticsearch-setup - - kafka-setup networks: diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh index 84b0364898a061..5e6865e6b2502e 100644 --- 
a/docker/kafka-setup/kafka-setup.sh +++ b/docker/kafka-setup/kafka-setup.sh @@ -63,6 +63,9 @@ fi cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 +# Create build indices topic with infinite retention +kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --config retention.ms=-1 --topic $BUILD_INDICES_HISTORY_TOPIC & + kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $METADATA_AUDIT_EVENT_NAME & kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $METADATA_CHANGE_EVENT_NAME & kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $FAILED_METADATA_CHANGE_EVENT_NAME & @@ -84,9 +87,6 @@ rc=$? if [ $rc -ne 0 ]; then exit $rc; fi echo "Finished topic creation group 2." 
-# Create build indices topic with infinite retention -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --config retention.ms=-1 --topic $BUILD_INDICES_HISTORY_TOPIC - # Create topic for datahub usage event if [[ $DATAHUB_ANALYTICS_ENABLED == true ]]; then kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $DATAHUB_USAGE_EVENT_NAME diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 7710785f73c17c..97efff012eeeaa 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -84,7 +84,6 @@ services: - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - GRAPH_SERVICE_DIFF_MODE_ENABLED=true - GRAPH_SERVICE_IMPL=elasticsearch - JAVA_OPTS=-Xms1g -Xmx1g diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 17273943062655..952defaaf24018 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -86,7 +86,6 @@ services: - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - GRAPH_SERVICE_DIFF_MODE_ENABLED=true - GRAPH_SERVICE_IMPL=elasticsearch - JAVA_OPTS=-Xms1g -Xmx1g @@ -102,6 +101,27 @@ services: - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080 volumes: - ${HOME}/.datahub/plugins:/etc/datahub/plugins + datahub-upgrade: + command: + 
- -u + - BuildIndices + container_name: datahub-upgrade + environment: + - EBEAN_DATASOURCE_USERNAME=datahub + - EBEAN_DATASOURCE_PASSWORD=datahub + - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - ELASTICSEARCH_HOST=elasticsearch + - ELASTICSEARCH_PORT=9200 + - GRAPH_SERVICE_IMPL=elasticsearch + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml + hostname: datahub-upgrade + image: ${DATAHUB_UPGRADE_IMAGE:-linkedin/datahub-upgrade}:${DATAHUB_VERSION:-head} elasticsearch: container_name: elasticsearch environment: diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index eb645a6abae2f9..614034a3447042 100644 --- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -25,15 +25,13 @@ services: container_name: datahub-mce-consumer environment: - MCE_CONSUMER_ENABLED=true - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} - EBEAN_DATASOURCE_USERNAME=datahub - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_HOST=mysql:3306 - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - ELASTICSEARCH_HOST=elasticsearch - 
ELASTICSEARCH_PORT=9200 - ES_BULK_REFRESH_POLICY=WAIT_UNTIL @@ -41,6 +39,8 @@ services: - GRAPH_SERVICE_IMPL=elasticsearch - JAVA_OPTS=-Xms1g -Xmx1g - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml + - DATAHUB_SERVER_TYPE=quickstart + - DATAHUB_TELEMETRY_ENABLED=true - DATAHUB_SYSTEM_CLIENT_ID=__datahub_system - DATAHUB_SYSTEM_CLIENT_SECRET=JohnSnowKnowsNothing - ENTITY_SERVICE_ENABLE_RETENTION=true diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml index 7edfd52026473c..58f274ac02925f 100644 --- a/docker/quickstart/docker-compose.consumers.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers.quickstart.yml @@ -31,28 +31,28 @@ services: container_name: datahub-mce-consumer environment: - MCE_CONSUMER_ENABLED=true - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} - EBEAN_DATASOURCE_USERNAME=datahub - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_HOST=mysql:3306 - - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8&enabledTLSProtocols=TLSv1.2 + - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - ES_BULK_REFRESH_POLICY=WAIT_UNTIL + - GRAPH_SERVICE_DIFF_MODE_ENABLED=true + - GRAPH_SERVICE_IMPL=neo4j - NEO4J_HOST=http://neo4j:7474 - NEO4J_URI=bolt://neo4j - NEO4J_USERNAME=neo4j - NEO4J_PASSWORD=datahub - JAVA_OPTS=-Xms1g -Xmx1g - - GRAPH_SERVICE_DIFF_MODE_ENABLED=true - - GRAPH_SERVICE_IMPL=neo4j 
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml - DATAHUB_SYSTEM_CLIENT_ID=__datahub_system - DATAHUB_SYSTEM_CLIENT_SECRET=JohnSnowKnowsNothing + - DATAHUB_SERVER_TYPE=quickstart + - DATAHUB_TELEMETRY_ENABLED=true - ENTITY_SERVICE_ENABLE_RETENTION=true - MAE_CONSUMER_ENABLED=false - PE_CONSUMER_ENABLED=false diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index c1bfed8bd32a1e..fe253edcd09570 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -88,7 +88,6 @@ services: - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - NEO4J_HOST=http://neo4j:7474 - NEO4J_URI=bolt://neo4j - NEO4J_USERNAME=neo4j @@ -110,6 +109,27 @@ services: volumes: - ${HOME}/.datahub/plugins/:/etc/datahub/plugins - ${HOME}/.datahub/plugins/auth/resources/:/etc/datahub/plugins/auth/resources + datahub-upgrade: + command: + - -u + - BuildIndices + container_name: datahub-upgrade + environment: + - EBEAN_DATASOURCE_USERNAME=datahub + - EBEAN_DATASOURCE_PASSWORD=datahub + - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - ELASTICSEARCH_HOST=elasticsearch + - ELASTICSEARCH_PORT=9200 + - GRAPH_SERVICE_IMPL=elasticsearch + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml + hostname: datahub-upgrade + image: ${DATAHUB_UPGRADE_IMAGE:-linkedin/datahub-upgrade}:${DATAHUB_VERSION:-head} elasticsearch: container_name: elasticsearch environment: diff --git 
a/metadata-ingestion/src/datahub/cli/docker_check.py b/metadata-ingestion/src/datahub/cli/docker_check.py index 5c7a86357118ed..b7d0dee8720c53 100644 --- a/metadata-ingestion/src/datahub/cli/docker_check.py +++ b/metadata-ingestion/src/datahub/cli/docker_check.py @@ -10,6 +10,7 @@ "datahub-gms", "datahub-frontend-react", "kafka-setup", + "datahub-upgrade", "schema-registry", "broker", "zookeeper", @@ -26,6 +27,7 @@ "kafka-setup", "elasticsearch-setup", "mysql-setup", + "datahub-upgrade" ] CONTAINERS_TO_CHECK_IF_PRESENT = [ From 095e04cc04a16852483363436459663d45142489 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Thu, 29 Dec 2022 17:04:17 -0600 Subject: [PATCH 02/12] Fix registry for datahub-upgrade --- datahub-upgrade/build.gradle | 1 + docker/docker-compose-with-cassandra.yml | 2 +- docker/docker-compose-without-neo4j.yml | 2 +- docker/docker-compose.yml | 2 +- ...er-compose-without-neo4j-m1.quickstart.yml | 21 +++++++++++++++++++ ...ocker-compose-without-neo4j.quickstart.yml | 2 +- .../quickstart/docker-compose.quickstart.yml | 2 +- 7 files changed, 27 insertions(+), 5 deletions(-) diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 51535d0019eda7..03b5a0cec1aa60 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -5,6 +5,7 @@ plugins { } ext { + docker_registry = 'acryldata' docker_repo = 'datahub-upgrade' } diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 2d5f3c7bec34b3..34f712fb510379 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -156,7 +156,7 @@ services: build: context: ../ dockerfile: docker/datahub-upgrade/Dockerfile - image: ${DATAHUB_UPGRADE_IMAGE:-linkedin/datahub-upgrade}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} env_file: datahub-upgrade/env/docker-without-neo4j.env hostname: datahub-upgrade container_name: 
datahub-upgrade diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index a525b890cf1b0f..552862767c2942 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -119,7 +119,7 @@ services: build: context: ../ dockerfile: docker/datahub-upgrade/Dockerfile - image: ${DATAHUB_UPGRADE_IMAGE:-linkedin/datahub-upgrade}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} env_file: datahub-upgrade/env/docker-without-neo4j.env hostname: datahub-upgrade container_name: datahub-upgrade diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 56b7d0b7aa591f..f1739e950ee2f5 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -131,7 +131,7 @@ services: build: context: ../ dockerfile: docker/datahub-upgrade/Dockerfile - image: ${DATAHUB_UPGRADE_IMAGE:-linkedin/datahub-upgrade}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} env_file: datahub-upgrade/env/docker-without-neo4j.env hostname: datahub-upgrade container_name: datahub-upgrade diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 97efff012eeeaa..56d97b97e05ceb 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -99,6 +99,27 @@ services: - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080 volumes: - ${HOME}/.datahub/plugins:/etc/datahub/plugins + datahub-upgrade: + command: + - -u + - BuildIndices + container_name: datahub-upgrade + environment: + - EBEAN_DATASOURCE_USERNAME=datahub + - EBEAN_DATASOURCE_PASSWORD=datahub + - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 + - 
EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - ELASTICSEARCH_HOST=elasticsearch + - ELASTICSEARCH_PORT=9200 + - GRAPH_SERVICE_IMPL=elasticsearch + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml + hostname: datahub-upgrade + image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} elasticsearch: container_name: elasticsearch environment: diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 952defaaf24018..b77b0dd2be6145 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -121,7 +121,7 @@ services: - DATAHUB_GMS_PORT=8080 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml hostname: datahub-upgrade - image: ${DATAHUB_UPGRADE_IMAGE:-linkedin/datahub-upgrade}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} elasticsearch: container_name: elasticsearch environment: diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index fe253edcd09570..8b33c7ce4dbc78 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -129,7 +129,7 @@ services: - DATAHUB_GMS_PORT=8080 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml hostname: datahub-upgrade - image: ${DATAHUB_UPGRADE_IMAGE:-linkedin/datahub-upgrade}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} elasticsearch: container_name: elasticsearch environment: From c1e7d72b889dfe08e9325f89a79f6f70dd697062 Mon Sep 17 00:00:00 2001 From: 
David Leifker Date: Thu, 29 Dec 2022 17:30:40 -0600 Subject: [PATCH 03/12] add missing version in datahub-upgrade --- datahub-upgrade/build.gradle | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 03b5a0cec1aa60..7e2a4a75c43061 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -4,6 +4,8 @@ plugins { id 'com.palantir.docker' } +apply from: "../gradle/versioning/versioning.gradle" + ext { docker_registry = 'acryldata' docker_repo = 'datahub-upgrade' From 369b0bf208c7688e3676985b2b9baf39a5a44486 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Thu, 29 Dec 2022 18:36:51 -0600 Subject: [PATCH 04/12] Misc updates --- .../datahub/upgrade/buildindices/PreConfigureESStep.java | 2 +- datahub-web-react/build.gradle | 2 +- docker/elasticsearch/env/docker.env | 2 +- metadata-ingestion/junit.quick.xml | 1 - metadata-ingestion/src/datahub/cli/docker_check.py | 2 +- .../src/test/java/com/linkedin/metadata/ESTestUtils.java | 2 +- 6 files changed, 5 insertions(+), 6 deletions(-) delete mode 100644 metadata-ingestion/junit.quick.xml diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java index 09c864d36adfde..eb4e627b1eb038 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java @@ -71,7 +71,7 @@ public Function executable() { // Clone indices String clonedName = indexName + "_clone_" + System.currentTimeMillis(); - ResizeRequest resizeRequest = new ResizeRequest(indexName, clonedName); + ResizeRequest resizeRequest = new ResizeRequest(clonedName, indexName); boolean cloneAck = _esComponents.getSearchClient().indices().clone(resizeRequest, RequestOptions.DEFAULT).isAcknowledged(); 
log.info("Cloned index {} into {}, Acknowledged: {}", indexName, clonedName, cloneAck); if (!cloneAck) { diff --git a/datahub-web-react/build.gradle b/datahub-web-react/build.gradle index e63a911d236cf4..efbdba6b584f97 100644 --- a/datahub-web-react/build.gradle +++ b/datahub-web-react/build.gradle @@ -12,7 +12,7 @@ node { if (project.hasProperty('useSystemNode')) { download = ! project.getProperty('useSystemNode').toBoolean() } else { - download = false + download = true } // Version of node to use. diff --git a/docker/elasticsearch/env/docker.env b/docker/elasticsearch/env/docker.env index 038469425c25d0..7bb05e926f3896 100644 --- a/docker/elasticsearch/env/docker.env +++ b/docker/elasticsearch/env/docker.env @@ -1 +1 @@ -ES_JAVA_OPTS="-Xms256m -Xmx256m -Dlog4j2.formatMsgNoLookups=true" +ES_JAVA_OPTS="-Xms256m -Xmx386m -Dlog4j2.formatMsgNoLookups=true" diff --git a/metadata-ingestion/junit.quick.xml b/metadata-ingestion/junit.quick.xml deleted file mode 100644 index 1431e2cfebf26f..00000000000000 --- a/metadata-ingestion/junit.quick.xml +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/metadata-ingestion/src/datahub/cli/docker_check.py b/metadata-ingestion/src/datahub/cli/docker_check.py index b7d0dee8720c53..e43caa89c02e7e 100644 --- a/metadata-ingestion/src/datahub/cli/docker_check.py +++ b/metadata-ingestion/src/datahub/cli/docker_check.py @@ -27,7 +27,7 @@ "kafka-setup", "elasticsearch-setup", "mysql-setup", - "datahub-upgrade" + "datahub-upgrade", ] CONTAINERS_TO_CHECK_IF_PRESENT = [ diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java index d13685dfa866d3..5d74fa07e7b18e 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java @@ -54,7 +54,7 @@ private ESTestUtils() { static { ES_CONTAINER = new ElasticsearchContainer(DOCKER_IMAGE_NAME); 
checkContainerEngine(ES_CONTAINER.getDockerClient()); - ES_CONTAINER.withEnv("ES_JAVA_OPTS", "-Xms64m -Xmx200m -XX:MaxDirectMemorySize=134217728") + ES_CONTAINER.withEnv("ES_JAVA_OPTS", "-Xms64m -Xmx200m -XX:MaxDirectMemorySize=268435456") .withStartupTimeout(Duration.ofMinutes(5)); // usually < 1min } From 9b030c0617a48d4ae71c5d22be77e9e5cde2182e Mon Sep 17 00:00:00 2001 From: David Leifker Date: Thu, 29 Dec 2022 18:39:34 -0600 Subject: [PATCH 05/12] quickstart-update --- docker/quickstart/docker-compose-without-neo4j.quickstart.yml | 2 +- docker/quickstart/docker-compose.quickstart.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index b77b0dd2be6145..26cdc85f2340fa 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -127,7 +127,7 @@ services: environment: - discovery.type=single-node - xpack.security.enabled=false - - ES_JAVA_OPTS=-Xms256m -Xmx256m -Dlog4j2.formatMsgNoLookups=true + - ES_JAVA_OPTS=-Xms256m -Xmx386m -Dlog4j2.formatMsgNoLookups=true healthcheck: retries: 4 start_period: 2m diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index 8b33c7ce4dbc78..54aa8275985f7c 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -135,7 +135,7 @@ services: environment: - discovery.type=single-node - xpack.security.enabled=false - - ES_JAVA_OPTS=-Xms256m -Xmx256m -Dlog4j2.formatMsgNoLookups=true + - ES_JAVA_OPTS=-Xms256m -Xmx386m -Dlog4j2.formatMsgNoLookups=true healthcheck: retries: 4 start_period: 2m From e27c09c747320e360ae58a3f23fa1ef37e7d6819 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Fri, 30 Dec 2022 18:07:47 -0600 Subject: [PATCH 06/12] refactor(reindexing): refactor reindexing logic --- 
.../upgrade/buildindices/BuildIndices.java | 31 ++- .../buildindices/BuildIndicesStep.java | 19 +- .../upgrade/buildindices/IndexUtils.java | 37 +-- .../buildindices/PostBuildIndicesStep.java | 18 +- .../buildindices/PreConfigureESStep.java | 23 +- .../upgrade/config/BuildIndicesConfig.java | 5 +- docker/elasticsearch/env/docker.env | 2 +- docker/kafka-setup/Dockerfile | 5 +- docker/kafka-setup/kafka-config.sh | 14 + docker/kafka-setup/kafka-setup.sh | 127 ++++++--- docker/kafka-setup/kafka-topic-workers.sh | 65 +++++ ...er-compose-without-neo4j-m1.quickstart.yml | 2 +- ...ocker-compose-without-neo4j.quickstart.yml | 2 +- .../quickstart/docker-compose.quickstart.yml | 2 +- .../elastic/ElasticSearchGraphService.java | 22 +- .../elasticsearch/ElasticSearchService.java | 18 +- .../indexbuilder/ESIndexBuilder.java | 246 +++++------------- .../indexbuilder/EntityIndexBuilder.java | 18 +- .../indexbuilder/EntityIndexBuilders.java | 36 ++- .../indexbuilder/MappingsBuilder.java | 5 +- .../indexbuilder/ReindexConfig.java | 220 ++++++++++++++++ .../indexbuilder/SettingsBuilder.java | 2 + .../metadata/shared/ElasticSearchIndexed.java | 21 ++ .../ElasticSearchSystemMetadataService.java | 20 +- .../ElasticSearchTimeseriesAspectService.java | 17 +- .../TimeseriesAspectIndexBuilders.java | 50 ++-- .../metadata/ESSampleDataFixture.java | 2 +- .../metadata/ESSearchLineageFixture.java | 2 +- 28 files changed, 696 insertions(+), 335 deletions(-) create mode 100644 docker/kafka-setup/kafka-config.sh create mode 100644 docker/kafka-setup/kafka-topic-workers.sh create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/shared/ElasticSearchIndexed.java diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/BuildIndices.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/BuildIndices.java index 
427f54489afecf..ab2c4ace322a6b 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/BuildIndices.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/BuildIndices.java @@ -8,14 +8,17 @@ import com.linkedin.metadata.dao.producer.KafkaEventProducer; import com.linkedin.metadata.dao.producer.KafkaHealthChecker; import com.linkedin.metadata.graph.GraphService; -import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.search.EntitySearchService; +import com.linkedin.metadata.shared.ElasticSearchIndexed; import com.linkedin.metadata.systemmetadata.SystemMetadataService; import com.linkedin.metadata.timeseries.TimeseriesAspectService; import com.linkedin.metadata.version.GitVersion; import com.linkedin.mxe.TopicConvention; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + import org.apache.avro.generic.IndexedRecord; import org.apache.kafka.clients.producer.Producer; @@ -27,11 +30,17 @@ public class BuildIndices implements Upgrade { public BuildIndices(final SystemMetadataService systemMetadataService, final TimeseriesAspectService timeseriesAspectService, final EntitySearchService entitySearchService, final GraphService graphService, final BaseElasticSearchComponentsFactory.BaseElasticSearchComponents baseElasticSearchComponents, - final EntityRegistry entityRegistry, final Producer producer, + final Producer producer, final TopicConvention convention, final GitVersion gitVersion, final KafkaHealthChecker kafkaHealthChecker) { - final KafkaEventProducer kafkaEventProducer = new KafkaEventProducer(producer, convention, kafkaHealthChecker); - _steps = buildSteps(systemMetadataService, timeseriesAspectService, entitySearchService, graphService, - baseElasticSearchComponents, entityRegistry, kafkaEventProducer, gitVersion); + final KafkaEventProducer kafkaEventProducer = new KafkaEventProducer(producer, 
convention, kafkaHealthChecker); + + List indexedServices = Stream.of( + graphService, entitySearchService, systemMetadataService, timeseriesAspectService) + .filter(service -> service instanceof ElasticSearchIndexed) + .map(service -> (ElasticSearchIndexed) service) + .collect(Collectors.toList()); + + _steps = buildSteps(indexedServices, baseElasticSearchComponents, kafkaEventProducer, gitVersion); } @Override @@ -44,17 +53,17 @@ public List steps() { return _steps; } - private List buildSteps(final SystemMetadataService systemMetadataService, final TimeseriesAspectService - timeseriesAspectService, final EntitySearchService entitySearchService, final GraphService graphService, + private List buildSteps(final List indexedServices, final BaseElasticSearchComponentsFactory.BaseElasticSearchComponents baseElasticSearchComponents, - final EntityRegistry entityRegistry, final KafkaEventProducer eventProducer, final GitVersion gitVersion) { + final KafkaEventProducer eventProducer, final GitVersion gitVersion) { + final List steps = new ArrayList<>(); // Disable ES write mode/change refresh rate and clone indices - steps.add(new PreConfigureESStep(baseElasticSearchComponents, entityRegistry)); + steps.add(new PreConfigureESStep(baseElasticSearchComponents, indexedServices)); // Configure graphService, entitySearchService, systemMetadataService, timeseriesAspectService - steps.add(new BuildIndicesStep(graphService, entitySearchService, systemMetadataService, timeseriesAspectService)); + steps.add(new BuildIndicesStep(indexedServices)); // Reset configuration (and delete clones? Or just do this regularly? Or delete clone in pre-configure step if it already exists? 
- steps.add(new PostBuildIndicesStep(baseElasticSearchComponents, entityRegistry, eventProducer, gitVersion)); + steps.add(new PostBuildIndicesStep(baseElasticSearchComponents, indexedServices, eventProducer, gitVersion)); return steps; } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/BuildIndicesStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/BuildIndicesStep.java index a876ae8475787f..c37bbfebfb6c91 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/BuildIndicesStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/BuildIndicesStep.java @@ -4,10 +4,9 @@ import com.linkedin.datahub.upgrade.UpgradeStep; import com.linkedin.datahub.upgrade.UpgradeStepResult; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; -import com.linkedin.metadata.graph.GraphService; -import com.linkedin.metadata.search.EntitySearchService; -import com.linkedin.metadata.systemmetadata.SystemMetadataService; -import com.linkedin.metadata.timeseries.TimeseriesAspectService; +import com.linkedin.metadata.shared.ElasticSearchIndexed; + +import java.util.List; import java.util.function.Function; import lombok.RequiredArgsConstructor; @@ -15,10 +14,7 @@ @RequiredArgsConstructor public class BuildIndicesStep implements UpgradeStep { - private final GraphService _graphService; - private final EntitySearchService _entitySearchService; - private final SystemMetadataService _systemMetadataService; - private final TimeseriesAspectService _timeseriesAspectService; + private final List _services; @Override public String id() { @@ -34,10 +30,9 @@ public int retryCount() { public Function executable() { return (context) -> { try { - _graphService.configure(); - _entitySearchService.configure(); - _systemMetadataService.configure(); - _timeseriesAspectService.configure(); + for (ElasticSearchIndexed service : _services) { + service.reindexAll(); + } } 
catch (Exception e) { return new DefaultUpgradeStepResult(id(), UpgradeStepResult.Result.FAILED); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/IndexUtils.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/IndexUtils.java index d2ddcb6adf052b..d243661806d24b 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/IndexUtils.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/IndexUtils.java @@ -1,11 +1,9 @@ package com.linkedin.datahub.upgrade.buildindices; -import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; -import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; -import com.linkedin.metadata.models.AspectSpec; -import com.linkedin.metadata.models.registry.EntityRegistry; -import com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService; -import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; +import com.linkedin.metadata.shared.ElasticSearchIndexed; + +import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -13,27 +11,18 @@ public class IndexUtils { private IndexUtils() { } - private static List _indexNames = new ArrayList<>(); + private static List _reindexConfigs = new ArrayList<>(); - public static List getAllIndexNames(BaseElasticSearchComponentsFactory.BaseElasticSearchComponents esComponents, - EntityRegistry entityRegistry) { + public static List getAllReindexConfigs(List elasticSearchIndexedList) throws IOException { // Avoid locking & reprocessing - List indexNames = new ArrayList<>(_indexNames); - if (indexNames.isEmpty()) { - IndexConvention indexConvention = esComponents.getIndexConvention(); - indexNames.add(indexConvention.getIndexName(ElasticSearchGraphService.INDEX_NAME)); - entityRegistry.getEntitySpecs().values().forEach(entitySpec -> { - 
indexNames.add(indexConvention.getEntityIndexName(entitySpec.getName())); - entitySpec.getAspectSpecs() - .stream() - .filter(AspectSpec::isTimeseries) - .forEach(aspectSpec -> - indexNames.add(indexConvention.getTimeseriesAspectIndexName(entitySpec.getName(), aspectSpec.getName()))); - }); - indexNames.add(indexConvention.getIndexName(ElasticSearchSystemMetadataService.INDEX_NAME)); - _indexNames = new ArrayList<>(indexNames); + List reindexConfigs = new ArrayList<>(_reindexConfigs); + if (reindexConfigs.isEmpty()) { + for (ElasticSearchIndexed elasticSearchIndexed : elasticSearchIndexedList) { + reindexConfigs.addAll(elasticSearchIndexed.getReindexConfigs()); + } + _reindexConfigs = new ArrayList<>(reindexConfigs); } - return indexNames; + return reindexConfigs; } } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PostBuildIndicesStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PostBuildIndicesStep.java index c811c2b02d612f..bcf461119c9351 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PostBuildIndicesStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PostBuildIndicesStep.java @@ -7,12 +7,15 @@ import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; import com.linkedin.metadata.dao.producer.KafkaEventProducer; -import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; +import com.linkedin.metadata.shared.ElasticSearchIndexed; import com.linkedin.metadata.version.GitVersion; import com.linkedin.mxe.BuildIndicesHistoryEvent; import java.util.List; import java.util.Map; import java.util.function.Function; +import java.util.stream.Collectors; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import 
org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest; @@ -26,7 +29,7 @@ public class PostBuildIndicesStep implements UpgradeStep { private final BaseElasticSearchComponentsFactory.BaseElasticSearchComponents _esComponents; - private final EntityRegistry _entityRegistry; + private final List _services; private final KafkaEventProducer _kafkaEventProducer; private final GitVersion _gitVersion; @@ -44,17 +47,20 @@ public int retryCount() { public Function executable() { return (context) -> { try { - List indexNames = getAllIndexNames(_esComponents, _entityRegistry); + + List indexConfigs = getAllReindexConfigs(_services) + .stream().filter(ReindexConfig::requiresReindex) + .collect(Collectors.toList()); // Reset write blocking - for (String indexName : indexNames) { - UpdateSettingsRequest request = new UpdateSettingsRequest(indexName); + for (ReindexConfig indexConfig : indexConfigs) { + UpdateSettingsRequest request = new UpdateSettingsRequest(indexConfig.name()); Map indexSettings = ImmutableMap.of("index.blocks.write", "false"); request.settings(indexSettings); boolean ack = _esComponents.getSearchClient().indices().putSettings(request, RequestOptions.DEFAULT).isAcknowledged(); - log.info("Updated index {} with new settings. Settings: {}, Acknowledged: {}", indexName, indexSettings, ack); + log.info("Updated index {} with new settings. Settings: {}, Acknowledged: {}", indexConfig.name(), indexSettings, ack); if (!ack) { log.error( "Partial index settings update, some indices may still be blocking writes." 
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java index eb4e627b1eb038..c0954425ca079a 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java @@ -6,10 +6,13 @@ import com.linkedin.datahub.upgrade.UpgradeStepResult; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; -import com.linkedin.metadata.models.registry.EntityRegistry; import java.util.List; import java.util.Map; import java.util.function.Function; +import java.util.stream.Collectors; + +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; +import com.linkedin.metadata.shared.ElasticSearchIndexed; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.elasticsearch.ElasticsearchStatusException; @@ -25,7 +28,7 @@ public class PreConfigureESStep implements UpgradeStep { private final BaseElasticSearchComponentsFactory.BaseElasticSearchComponents _esComponents; - private final EntityRegistry _entityRegistry; + private final List _services; @Override public String id() { @@ -42,10 +45,12 @@ public Function executable() { return (context) -> { try { // Get indices to update - List indexNames = getAllIndexNames(_esComponents, _entityRegistry); + List indexConfigs = getAllReindexConfigs(_services) + .stream().filter(ReindexConfig::requiresReindex) + .collect(Collectors.toList()); - for (String indexName : indexNames) { - UpdateSettingsRequest request = new UpdateSettingsRequest(indexName); + for (ReindexConfig indexConfig : indexConfigs) { + UpdateSettingsRequest request = new UpdateSettingsRequest(indexConfig.name()); Map indexSettings = 
ImmutableMap.of("index.blocks.write", "true"); request.settings(indexSettings); @@ -62,7 +67,7 @@ public Function executable() { } throw ese; } - log.info("Updated index {} with new settings. Settings: {}, Acknowledged: {}", indexName, indexSettings, ack); + log.info("Updated index {} with new settings. Settings: {}, Acknowledged: {}", indexConfig.name(), indexSettings, ack); if (!ack) { log.error("Partial index settings update, some indices may still be blocking writes." + " Please fix the error and re-run the BuildIndices upgrade job."); @@ -70,10 +75,10 @@ public Function executable() { } // Clone indices - String clonedName = indexName + "_clone_" + System.currentTimeMillis(); - ResizeRequest resizeRequest = new ResizeRequest(clonedName, indexName); + String clonedName = indexConfig.name() + "_clone_" + System.currentTimeMillis(); + ResizeRequest resizeRequest = new ResizeRequest(clonedName, indexConfig.name()); boolean cloneAck = _esComponents.getSearchClient().indices().clone(resizeRequest, RequestOptions.DEFAULT).isAcknowledged(); - log.info("Cloned index {} into {}, Acknowledged: {}", indexName, clonedName, cloneAck); + log.info("Cloned index {} into {}, Acknowledged: {}", indexConfig.name(), clonedName, cloneAck); if (!cloneAck) { log.error("Partial index settings update, cloned indices may need to be cleaned up: {}", clonedName); return new DefaultUpgradeStepResult(id(), UpgradeStepResult.Result.FAILED); diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/BuildIndicesConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/BuildIndicesConfig.java index c494069d29d9a3..f4b68526609609 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/BuildIndicesConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/BuildIndicesConfig.java @@ -4,7 +4,6 @@ import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; import 
com.linkedin.metadata.dao.producer.KafkaHealthChecker; import com.linkedin.metadata.graph.GraphService; -import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.systemmetadata.SystemMetadataService; import com.linkedin.metadata.timeseries.TimeseriesAspectService; @@ -22,10 +21,10 @@ public class BuildIndicesConfig { public BuildIndices buildIndices(final SystemMetadataService systemMetadataService, final TimeseriesAspectService timeseriesAspectService, final EntitySearchService entitySearchService, final GraphService graphService, final BaseElasticSearchComponentsFactory.BaseElasticSearchComponents baseElasticSearchComponents, - final EntityRegistry entityRegistry, final Producer producer, + final Producer producer, final TopicConvention convention, final GitVersion gitVersion, final KafkaHealthChecker kafkaHealthChecker) { return new BuildIndices(systemMetadataService, timeseriesAspectService, entitySearchService, graphService, - baseElasticSearchComponents, entityRegistry, producer, convention, gitVersion, kafkaHealthChecker); + baseElasticSearchComponents, producer, convention, gitVersion, kafkaHealthChecker); } } diff --git a/docker/elasticsearch/env/docker.env b/docker/elasticsearch/env/docker.env index 7bb05e926f3896..4b1f0215ea6c8d 100644 --- a/docker/elasticsearch/env/docker.env +++ b/docker/elasticsearch/env/docker.env @@ -1 +1 @@ -ES_JAVA_OPTS="-Xms256m -Xmx386m -Dlog4j2.formatMsgNoLookups=true" +ES_JAVA_OPTS="-Xms256m -Xmx512m -Dlog4j2.formatMsgNoLookups=true" diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index 7dab497428a98f..5448af50eb0375 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -59,6 +59,9 @@ ENV PLATFORM_EVENT_TOPIC_NAME="PlatformEvent_v1" ENV BUILD_INDICES_HISTORY_TOPIC="BuildIndicesHistory_v1" COPY docker/kafka-setup/kafka-setup.sh ./kafka-setup.sh -RUN chmod +x ./kafka-setup.sh +COPY 
docker/kafka-setup/kafka-config.sh ./kafka-config.sh +COPY docker/kafka-setup/kafka-topic-workers.sh ./kafka-topic-workers.sh + +RUN chmod +x ./kafka-setup.sh && chmod +x ./kafka-topic-workers.sh CMD ./kafka-setup.sh diff --git a/docker/kafka-setup/kafka-config.sh b/docker/kafka-setup/kafka-config.sh new file mode 100644 index 00000000000000..2ba8e2d7c5d47c --- /dev/null +++ b/docker/kafka-setup/kafka-config.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +: ${PARTITIONS:=1} +: ${REPLICATION_FACTOR:=1} + +: ${KAFKA_PROPERTIES_SECURITY_PROTOCOL:=PLAINTEXT} + +: ${DATAHUB_ANALYTICS_ENABLED:=true} + +export KAFKA_HEAP_OPTS="-Xmx64M" + +CONNECTION_PROPERTIES_PATH=/tmp/connection.properties + +WORKERS=4 diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh index 5e6865e6b2502e..3b5a1deedf81a1 100644 --- a/docker/kafka-setup/kafka-setup.sh +++ b/docker/kafka-setup/kafka-setup.sh @@ -1,26 +1,6 @@ #!/bin/bash -: ${PARTITIONS:=1} -: ${REPLICATION_FACTOR:=1} -: ${KAFKA_PROPERTIES_SECURITY_PROTOCOL:=PLAINTEXT} - -: ${DATAHUB_ANALYTICS_ENABLED:=true} - -: ${KAFKA_HEAP_OPTS:=-Xmx64M} - -CONNECTION_PROPERTIES_PATH=/tmp/connection.properties - -function wait_ex { - # this waits for all jobs and returns the exit code of the last failing job - ecode=0 - while true; do - [ -z "$(jobs)" ] && break - wait -n - err="$?" - [ "$err" != "0" ] && ecode="$err" - done - return $ecode -} +. 
kafka-config.sh echo "bootstrap.servers=$KAFKA_BOOTSTRAP_SERVER" > $CONNECTION_PROPERTIES_PATH echo "security.protocol=$KAFKA_PROPERTIES_SECURITY_PROTOCOL" >> $CONNECTION_PROPERTIES_PATH @@ -63,33 +43,96 @@ fi cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 + +############################################################ +# Start Topic Creation Logic +############################################################ +# make the files +START=$(mktemp -t start-XXXX) +FIFO=$(mktemp -t fifo-XXXX) +FIFO_LOCK=$(mktemp -t lock-XXXX) +START_LOCK=$(mktemp -t lock-XXXX) + +## mktemp makes a regular file. Delete that an make a fifo. +rm $FIFO +mkfifo $FIFO +echo $FIFO + +## create a trap to cleanup on exit if we fail in the middle. +cleanup() { + rm $FIFO + rm $START + rm $FIFO_LOCK + rm $START_LOCK +} +trap cleanup 0 + +# Start worker script +. kafka-topic-workers.sh $START $FIFO $FIFO_LOCK $START_LOCK + +## Open the fifo for writing. +exec 3>$FIFO +## Open the start lock for reading +exec 4<$START_LOCK + +## Wait for the workers to start +while true; do + flock 4 + started=$(wc -l $START | cut -d \ -f 1) + flock -u 4 + if [[ $started -eq $WORKERS ]]; then + break + else + echo waiting, started $started of $WORKERS + fi +done +exec 4<&- + +## utility function to send the jobs to the workers +send() { + work_id=$1 + topic_args=$2 + echo sending $work_id $topic_args + echo "$work_id" "$topic_args" 1>&3 ## the fifo is fd 3 +} + +## Produce the jobs to run. 
+ # Create build indices topic with infinite retention -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --config retention.ms=-1 --topic $BUILD_INDICES_HISTORY_TOPIC & - -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $METADATA_AUDIT_EVENT_NAME & -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $METADATA_CHANGE_EVENT_NAME & -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $FAILED_METADATA_CHANGE_EVENT_NAME & -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $METADATA_CHANGE_LOG_VERSIONED_TOPIC & -echo "Waiting for topic creation group 1." -result=$(wait_ex) -rc=$? -if [ $rc -ne 0 ]; then exit $rc; fi -echo "Finished topic creation group 1." 
+send "$BUILD_INDICES_HISTORY_TOPIC" "--config retention.ms=-1 --topic $BUILD_INDICES_HISTORY_TOPIC" + +send "$METADATA_AUDIT_EVENT_NAME" "--topic $METADATA_AUDIT_EVENT_NAME" +send "$METADATA_CHANGE_EVENT_NAME" "--topic $METADATA_CHANGE_EVENT_NAME" +send "$FAILED_METADATA_CHANGE_EVENT_NAME" "--topic $FAILED_METADATA_CHANGE_EVENT_NAME" +send "$METADATA_CHANGE_LOG_VERSIONED_TOPIC" "--topic $METADATA_CHANGE_LOG_VERSIONED_TOPIC" # Set retention to 90 days -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --config retention.ms=7776000000 --topic $METADATA_CHANGE_LOG_TIMESERIES_TOPIC & -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $METADATA_CHANGE_PROPOSAL_TOPIC & -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $FAILED_METADATA_CHANGE_PROPOSAL_TOPIC & -kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $PLATFORM_EVENT_TOPIC_NAME & -echo "Waiting for topic creation group 2." -result=$(wait_ex) -rc=$? -if [ $rc -ne 0 ]; then exit $rc; fi -echo "Finished topic creation group 2." 
+send "$METADATA_CHANGE_LOG_TIMESERIES_TOPIC" "--config retention.ms=7776000000 --topic $METADATA_CHANGE_LOG_TIMESERIES_TOPIC" +send "$METADATA_CHANGE_PROPOSAL_TOPIC" "--topic $METADATA_CHANGE_PROPOSAL_TOPIC" +send "$FAILED_METADATA_CHANGE_PROPOSAL_TOPIC" "--topic $FAILED_METADATA_CHANGE_PROPOSAL_TOPIC" +send "$PLATFORM_EVENT_TOPIC_NAME" "--topic $PLATFORM_EVENT_TOPIC_NAME" # Create topic for datahub usage event if [[ $DATAHUB_ANALYTICS_ENABLED == true ]]; then - kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR --topic $DATAHUB_USAGE_EVENT_NAME + send "$DATAHUB_USAGE_EVENT_NAME" "--topic $DATAHUB_USAGE_EVENT_NAME" fi +## close the filo +exec 3<&- +## disable the cleanup trap +trap '' 0 +## It is safe to delete the files because the workers +## already opened them. Thus, only the names are going away +## the actual files will stay there until the workers +## all finish. +cleanup +## now wait for all the workers. +wait + +echo "Topic Creation Complete." + +############################################################ +# End Topic Creation Logic +############################################################ + kafka-configs.sh --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --entity-type topics --entity-name _schemas --alter --add-config cleanup.policy=compact diff --git a/docker/kafka-setup/kafka-topic-workers.sh b/docker/kafka-setup/kafka-topic-workers.sh new file mode 100644 index 00000000000000..b132a04201c603 --- /dev/null +++ b/docker/kafka-setup/kafka-topic-workers.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +. 
kafka-config.sh + +START=$1 +FIFO=$2 +FIFO_LOCK=$3 +START_LOCK=$4 + +## this is the "job" function which is does whatever work +## the queue workers are supposed to be doing +job() { + i=$1 + topic_args=$2 + kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER \ + --partitions $PARTITIONS --replication-factor $REPLICATION_FACTOR \ + $topic_args +} + +## This is the worker to read from the queue. +work() { + ID=$1 + ## first open the fifo and locks for reading. + exec 3<$FIFO + exec 4<$FIFO_LOCK + exec 5<$START_LOCK + + ## signal the worker has started. + flock 5 # obtain the start lock + echo $ID >> $START # put my worker ID in the start file + flock -u 5 # release the start lock + exec 5<&- # close the start lock file + echo worker $ID started + + while true; do + ## try to read the queue + flock 4 # obtain the fifo lock + read -su 3 work_id work_item # read into work_id and work_item + read_status=$? # save the exit status of read + flock -u 4 # release the fifo lock + + ## check the line read. + if [[ $read_status -eq 0 ]]; then + ## If read gives an exit code of 0 the read succeeded. + # got a work item. do the work + echo $ID got work_id=$work_id topic_args=$work_item + ## Run the job in a subshell. That way any exit calls do not kill + ## the worker process. + ( job "$work_id" "$work_item" ) + else + ## Any other exit code indicates an EOF. + break + fi + done + # clean up the fd(s) + exec 3<&- + exec 4<&- + echo $ID "done working" +} + +## Start the workers. 
+for ((i=1;i<=$WORKERS;i++)); do + echo will start $i + work $i & +done diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 56d97b97e05ceb..67eaf39c0312d6 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -125,7 +125,7 @@ services: environment: - discovery.type=single-node - xpack.security.enabled=false - - ES_JAVA_OPTS=-Xms256m -Xmx256m -Dlog4j2.formatMsgNoLookups=true + - ES_JAVA_OPTS=-Xms256m -Xmx512m -Dlog4j2.formatMsgNoLookups=true healthcheck: retries: 4 start_period: 2m diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 26cdc85f2340fa..7fb9d263094db1 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -127,7 +127,7 @@ services: environment: - discovery.type=single-node - xpack.security.enabled=false - - ES_JAVA_OPTS=-Xms256m -Xmx386m -Dlog4j2.formatMsgNoLookups=true + - ES_JAVA_OPTS=-Xms256m -Xmx512m -Dlog4j2.formatMsgNoLookups=true healthcheck: retries: 4 start_period: 2m diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index 54aa8275985f7c..63b39e93bcc342 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -135,7 +135,7 @@ services: environment: - discovery.type=single-node - xpack.security.enabled=false - - ES_JAVA_OPTS=-Xms256m -Xmx386m -Dlog4j2.formatMsgNoLookups=true + - ES_JAVA_OPTS=-Xms256m -Xmx512m -Dlog4j2.formatMsgNoLookups=true healthcheck: retries: 4 start_period: 2m diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java 
b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java index a2ab033d3edbd4..1d48214ce9bd6e 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java @@ -23,7 +23,9 @@ import com.linkedin.metadata.query.filter.RelationshipDirection; import com.linkedin.metadata.query.filter.RelationshipFilter; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import com.linkedin.metadata.shared.ElasticSearchIndexed; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import io.opentelemetry.extension.annotations.WithSpan; import java.io.IOException; @@ -49,7 +51,7 @@ @Slf4j @RequiredArgsConstructor -public class ElasticSearchGraphService implements GraphService { +public class ElasticSearchGraphService implements GraphService, ElasticSearchIndexed { private final LineageRegistry _lineageRegistry; private final ESBulkProcessor _esBulkProcessor; @@ -276,13 +278,25 @@ public void removeEdgesFromNode( public void configure() { log.info("Setting up elastic graph index"); try { - _indexBuilder.buildIndex(_indexConvention.getIndexName(INDEX_NAME), - GraphRelationshipMappingsBuilder.getMappings(), Collections.emptyMap()); + for (ReindexConfig config : getReindexConfigs()) { + _indexBuilder.buildIndex(config); + } } catch (IOException e) { - e.printStackTrace(); + throw new RuntimeException(e); } } + @Override + public List getReindexConfigs() throws IOException { + return List.of(_indexBuilder.buildReindexState(_indexConvention.getIndexName(INDEX_NAME), + GraphRelationshipMappingsBuilder.getMappings(), Collections.emptyMap())); + } + + @Override + public void reindexAll() { + configure(); + } + @VisibleForTesting 
@Override public void clear() { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java index adb529fdde1264..78347af4ff49c2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java @@ -8,23 +8,27 @@ import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.elasticsearch.indexbuilder.EntityIndexBuilders; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; import com.linkedin.metadata.search.elasticsearch.query.ESBrowseDAO; import com.linkedin.metadata.search.elasticsearch.query.ESSearchDAO; import com.linkedin.metadata.search.elasticsearch.update.ESWriteDAO; import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.search.utils.SearchUtils; + import java.util.List; import java.util.Map; import java.util.Optional; import javax.annotation.Nonnull; import javax.annotation.Nullable; + +import com.linkedin.metadata.shared.ElasticSearchIndexed; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @Slf4j @RequiredArgsConstructor -public class ElasticSearchService implements EntitySearchService { +public class ElasticSearchService implements EntitySearchService, ElasticSearchIndexed { private static final int MAX_RUN_IDS_INDEXED = 25; // Save the previous 25 run ids in the index. 
private final EntityIndexBuilders indexBuilders; @@ -34,7 +38,17 @@ public class ElasticSearchService implements EntitySearchService { @Override public void configure() { - indexBuilders.buildAll(); + indexBuilders.reindexAll(); + } + + @Override + public List getReindexConfigs() { + return indexBuilders.getReindexConfigs(); + } + + @Override + public void reindexAll() { + configure(); } @Override diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java index 761233083e7c48..9a04f616626541 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java @@ -1,21 +1,11 @@ package com.linkedin.metadata.search.elasticsearch.indexbuilder; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.MapDifference; -import com.google.common.collect.Maps; import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Set; +import java.util.*; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; -import java.util.stream.Stream; import javax.annotation.Nonnull; import lombok.Getter; @@ -74,125 +64,99 @@ public class ESIndexBuilder { @Getter private final boolean enableIndexMappingsReindex; - private final static ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - /* - Most index settings are default values and populated by Elastic. This list is an include list to determine which - settings we care about when a difference is present. 
- */ - private static final List SETTINGS_DYNAMIC = ImmutableList.of("number_of_replicas", "refresh_interval"); - // These setting require reindex - private static final List SETTINGS_STATIC = ImmutableList.of("number_of_shards"); - private static final List SETTINGS = Stream.concat( - SETTINGS_DYNAMIC.stream(), SETTINGS_STATIC.stream()).collect(Collectors.toList()); - - public void buildIndex(String indexName, Map mappings, Map settings) - throws IOException { - // Check if index exists - boolean exists = searchClient.indices().exists(new GetIndexRequest(indexName), RequestOptions.DEFAULT); + public ReindexConfig buildReindexState(String indexName, Map mappings, Map settings) throws IOException { + ReindexConfig.ReindexConfigBuilder builder = ReindexConfig.builder() + .name(indexName) + .enableIndexSettingsReindex(enableIndexSettingsReindex) + .enableIndexMappingsReindex(enableIndexMappingsReindex) + .targetMappings(mappings); Map baseSettings = new HashMap<>(settings); baseSettings.put("number_of_shards", numShards); baseSettings.put("number_of_replicas", numReplicas); baseSettings.put("refresh_interval", String.format("%ss", refreshIntervalSeconds)); baseSettings.putAll(indexSettingOverrides.getOrDefault(indexName, Map.of())); - Map finalSettings = ImmutableMap.of("index", baseSettings); + Map targetSetting = ImmutableMap.of("index", baseSettings); + builder.targetSettings(targetSetting); - // If index doesn't exist, create index + // Check if index exists + boolean exists = searchClient.indices().exists(new GetIndexRequest(indexName), RequestOptions.DEFAULT); + builder.exists(exists); + + // If index doesn't exist, no reindex if (!exists) { - createIndex(indexName, mappings, finalSettings); - return; + return builder.build(); } - Map oldMappings = searchClient.indices() - .getMapping(new GetMappingsRequest().indices(indexName), RequestOptions.DEFAULT) - .mappings() - .values() - .stream() - .findFirst() - .get() - .getSourceAsMap(); - - MapDifference 
mappingsDiff = Maps.difference( - (Map) oldMappings.getOrDefault("properties", Map.of()), - (Map) mappings.getOrDefault("properties", Map.of())); - - Settings oldSettings = searchClient.indices() - .getSettings(new GetSettingsRequest().indices(indexName), RequestOptions.DEFAULT) - .getIndexToSettings() - .valuesIt() - .next(); - - final boolean isAnalysisEqual = isAnalysisEqual(finalSettings, oldSettings); - final boolean isSettingsEqual = isSettingsEqual(finalSettings, oldSettings); - final boolean isSettingsReindexRequired = isSettingsReindexRequired(finalSettings, oldSettings); + Settings currentSettings = searchClient.indices() + .getSettings(new GetSettingsRequest().indices(indexName), RequestOptions.DEFAULT) + .getIndexToSettings() + .valuesIt() + .next(); + builder.currentSettings(currentSettings); + + Map currentMappings = searchClient.indices() + .getMapping(new GetMappingsRequest().indices(indexName), RequestOptions.DEFAULT) + .mappings() + .values() + .stream() + .findFirst() + .get() + .getSourceAsMap(); + builder.currentMappings(currentMappings); + + return builder.build(); + } - // If there are no updates to mappings and settings, return - if (mappingsDiff.areEqual() && isAnalysisEqual && isSettingsEqual) { - log.info("No updates to index {}", indexName); + public void buildIndex(String indexName, Map mappings, Map settings) throws IOException { + buildIndex(buildReindexState(indexName, mappings, settings)); + } + + public void buildIndex(ReindexConfig indexState) throws IOException { + // If index doesn't exist, create index + if (!indexState.exists()) { + createIndex(indexState.name(), indexState); return; } - // If there are no updates to settings, and there are only pure additions to mappings (no updates to existing fields), - // there is no need to reindex. Just update mappings - if (isAnalysisEqual && isPureAddition(mappingsDiff) && isSettingsEqual) { - log.info("New fields have been added to index {}. Updating index in place. 
Adding: {}", indexName, mappingsDiff); - PutMappingRequest request = new PutMappingRequest(indexName).source(mappings); - searchClient.indices().putMapping(request, RequestOptions.DEFAULT); - log.info("Updated index {} with new mappings", indexName); + // If there are no updates to mappings and settings, return + if (!indexState.requiresApplyMappings() && !indexState.requiresApplySettings()) { + log.info("No updates to index {}", indexState.name()); return; } - if (!mappingsDiff.entriesDiffering().isEmpty()) { - if (enableIndexMappingsReindex) { - log.info("There's diff between new mappings (left) and old mappings (right): {}", mappingsDiff); - reindex(indexName, mappings, finalSettings); - } else { - log.warn("There's diff between new mappings, however reindexing is DISABLED. (left) and old mappings (right): {}", mappingsDiff); - } - } else { - log.info("There's an update to settings"); - if (isSettingsReindexRequired) { - if (enableIndexSettingsReindex) { - log.info("There's an update to settings that requires reindexing. Target: {}", - OBJECT_MAPPER.writeValueAsString(finalSettings)); - reindex(indexName, mappings, finalSettings); - } else { - log.warn("There's an update to settings that requires reindexing, however reindexing is DISABLED. 
Existing: {} Target: {}", - oldSettings, OBJECT_MAPPER.writeValueAsString(finalSettings)); - } + if (!indexState.requiresReindex()) { + // no need to reindex and only new mappings or dynamic settings + + // Just update the additional mappings + if (indexState.isPureMappingsAddition()) { + log.info("Updating index {} mappings in place.", indexState.name()); + PutMappingRequest request = new PutMappingRequest(indexState.name()).source(indexState.targetMappings()); + searchClient.indices().putMapping(request, RequestOptions.DEFAULT); + log.info("Updated index {} with new mappings", indexState.name()); } - /* - If we allow reindexing, then any setting that doesn't require reindexing is also - applied above and our equality is out of date. We don't want to apply them again for no reason. - */ - boolean settingsApplied = isSettingsReindexRequired && enableIndexSettingsReindex; - if (!isSettingsEqual && !settingsApplied) { - UpdateSettingsRequest request = new UpdateSettingsRequest(indexName); - Map indexSettings = ((Map) finalSettings.get("index")) + if (indexState.requiresApplySettings()) { + UpdateSettingsRequest request = new UpdateSettingsRequest(indexState.name()); + Map indexSettings = ((Map) indexState.targetSettings().get("index")) .entrySet().stream() - .filter(e -> SETTINGS_DYNAMIC.contains(e.getKey())) + .filter(e -> ReindexConfig.SETTINGS_DYNAMIC.contains(e.getKey())) .collect(Collectors.toMap(e -> "index." + e.getKey(), Map.Entry::getValue)); + request.settings(indexSettings); - /* - We might not have any changes that can be applied without reindex. This is the case when a reindex - is needed due to a setting, but not allowed. We don't want to apply empty settings for no reason. - */ - if (!indexSettings.isEmpty()) { - request.settings(indexSettings); - boolean ack = searchClient.indices().putSettings(request, RequestOptions.DEFAULT).isAcknowledged(); - log.info("Updated index {} with new settings. 
Settings: {}, Acknowledged: {}", indexName, - OBJECT_MAPPER.writeValueAsString(indexSettings), ack); - } + boolean ack = searchClient.indices().putSettings(request, RequestOptions.DEFAULT).isAcknowledged(); + log.info("Updated index {} with new settings. Settings: {}, Acknowledged: {}", indexState.name(), + ReindexConfig.OBJECT_MAPPER.writeValueAsString(indexSettings), ack); } + } else { + reindex(indexState.name(), indexState); } } - private void reindex(String indexName, Map mappings, Map finalSettings) - throws IOException { + private void reindex(String indexName, ReindexConfig indexState) throws IOException { String tempIndexName = indexName + "_" + System.currentTimeMillis(); - createIndex(tempIndexName, mappings, finalSettings); + createIndex(tempIndexName, indexState); try { ReindexRequest reindexRequest = new ReindexRequest() .setSourceIndices(indexName) @@ -301,84 +265,12 @@ private long getCount(@Nonnull String indexName) throws IOException { .getCount(); } - private void createIndex(String indexName, Map mappings, Map settings) - throws IOException { + private void createIndex(String indexName, ReindexConfig state) throws IOException { log.info("Index {} does not exist. 
Creating", indexName); CreateIndexRequest createIndexRequest = new CreateIndexRequest(indexName); - createIndexRequest.mapping(mappings); - createIndexRequest.settings(settings); + createIndexRequest.mapping(state.targetMappings()); + createIndexRequest.settings(state.targetSettings()); searchClient.indices().create(createIndexRequest, RequestOptions.DEFAULT); log.info("Created index {}", indexName); } - - private static boolean isPureAddition(MapDifference mapDifference) { - return !mapDifference.areEqual() && mapDifference.entriesDiffering().isEmpty() - && !mapDifference.entriesOnlyOnRight().isEmpty(); - } - - private static boolean isAnalysisEqual(Map newSettings, Settings oldSettings) { - if (!newSettings.containsKey("index")) { - return true; - } - Map indexSettings = (Map) newSettings.get("index"); - if (!indexSettings.containsKey("analysis")) { - return true; - } - // Compare analysis section - Map newAnalysis = (Map) indexSettings.get("analysis"); - Settings oldAnalysis = oldSettings.getByPrefix("index.analysis."); - return equalsGroup(newAnalysis, oldAnalysis); - } - - private static boolean isSettingsEqual(Map newSettings, Settings oldSettings) { - if (!newSettings.containsKey("index")) { - return true; - } - Map indexSettings = (Map) newSettings.get("index"); - return SETTINGS.stream() - .allMatch(settingKey -> Objects.equals(indexSettings.get(settingKey).toString(), oldSettings.get("index." + settingKey))); - } - - private static boolean isSettingsReindexRequired(Map newSettings, Settings oldSettings) { - if (!newSettings.containsKey("index")) { - return false; - } - Map indexSettings = (Map) newSettings.get("index"); - - if (SETTINGS_STATIC.stream().anyMatch(settingKey -> - !Objects.equals(indexSettings.get(settingKey).toString(), oldSettings.get("index." 
+ settingKey)))) { - return true; - } - - return indexSettings.containsKey("analysis") - && !equalsGroup((Map) indexSettings.get("analysis"), oldSettings.getByPrefix("index.analysis.")); - } - - private static boolean equalsGroup(Map newSettings, Settings oldSettings) { - if (!newSettings.keySet().equals(oldSettings.names())) { - return false; - } - - for (String key : newSettings.keySet()) { - // Skip urn stop filter, as adding new entities will cause this filter to change - // No need to reindex every time a new entity is added - if (key.equals("urn_stop_filter")) { - continue; - } - if (newSettings.get(key) instanceof Map) { - if (!equalsGroup((Map) newSettings.get(key), oldSettings.getByPrefix(key + "."))) { - return false; - } - } else if (newSettings.get(key) instanceof List) { - if (!newSettings.get(key).equals(oldSettings.getAsList(key))) { - return false; - } - } else { - if (!newSettings.get(key).toString().equals(oldSettings.get(key))) { - return false; - } - } - } - return true; - } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilder.java index 6709a1160c03c2..04c9f1993ff352 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilder.java @@ -2,24 +2,34 @@ import com.linkedin.metadata.models.EntitySpec; import java.io.IOException; +import java.util.List; import java.util.Map; + +import com.linkedin.metadata.shared.ElasticSearchIndexed; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @Slf4j @RequiredArgsConstructor -public class EntityIndexBuilder { +public class EntityIndexBuilder implements ElasticSearchIndexed { private final ESIndexBuilder indexBuilder; private final EntitySpec entitySpec; 
private final SettingsBuilder settingsBuilder; private final String indexName; - public void buildIndex() throws IOException { + @Override + public void reindexAll() throws IOException { log.info("Setting up index: {}", indexName); + for (ReindexConfig config : getReindexConfigs()) { + indexBuilder.buildIndex(config); + } + } + + @Override + public List getReindexConfigs() throws IOException { Map mappings = MappingsBuilder.getMappings(entitySpec); Map settings = settingsBuilder.getSettings(); - - indexBuilder.buildIndex(indexName, mappings, settings); + return List.of(indexBuilder.buildReindexState(indexName, mappings, settings)); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilders.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilders.java index 349187bd347a37..dd63132bb8b1b0 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilders.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilders.java @@ -1,27 +1,43 @@ package com.linkedin.metadata.search.elasticsearch.indexbuilder; -import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.shared.ElasticSearchIndexed; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + import lombok.RequiredArgsConstructor; @RequiredArgsConstructor -public class EntityIndexBuilders { +public class EntityIndexBuilders implements ElasticSearchIndexed { private final ESIndexBuilder indexBuilder; private final EntityRegistry entityRegistry; private final IndexConvention indexConvention; private final SettingsBuilder settingsBuilder; - public void buildAll() { - for (EntitySpec entitySpec : 
entityRegistry.getEntitySpecs().values()) { - try { - new EntityIndexBuilder(indexBuilder, entitySpec, settingsBuilder, - indexConvention.getIndexName(entitySpec)).buildIndex(); - } catch (IOException e) { - e.printStackTrace(); + @Override + public void reindexAll() { + for (ReindexConfig config : getReindexConfigs()) { + try { + indexBuilder.buildIndex(config); + } catch (IOException e) { + throw new RuntimeException(e); + } } - } + } + + @Override + public List getReindexConfigs() { + return entityRegistry.getEntitySpecs().values().stream().flatMap(entitySpec -> { + try { + return new EntityIndexBuilder(indexBuilder, entitySpec, settingsBuilder, indexConvention.getIndexName(entitySpec)) + .getReindexConfigs().stream(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + ).collect(Collectors.toList()); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index fda7e7c1127559..4774990d5121d3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -17,8 +17,9 @@ public class MappingsBuilder { public static final Map PARTIAL_NGRAM_CONFIG = ImmutableMap.of( - TYPE, "search_as_you_type", - MAX_SHINGLE_SIZE, "4"); + TYPE, "search_as_you_type", + MAX_SHINGLE_SIZE, "4", + DOC_VALUES, "false"); public static final Map KEYWORD_TYPE_MAP = ImmutableMap.of(TYPE, KEYWORD); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java new file mode 100644 index 00000000000000..9c02afba2167b8 --- /dev/null +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java @@ -0,0 +1,220 @@ +package com.linkedin.metadata.search.elasticsearch.indexbuilder; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.MapDifference; +import com.google.common.collect.Maps; +import lombok.Builder; +import lombok.Getter; +import lombok.experimental.Accessors; +import lombok.extern.slf4j.Slf4j; +import org.elasticsearch.common.settings.Settings; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +@Slf4j +@Builder +@Getter +@Accessors(fluent = true) +public class ReindexConfig { + public final static ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + /* + Most index settings are default values and populated by Elastic. This list is an include list to determine which + settings we care about when a difference is present. 
+ */ + public static final List SETTINGS_DYNAMIC = ImmutableList.of("number_of_replicas", "refresh_interval"); + // These setting require reindex + public static final List SETTINGS_STATIC = ImmutableList.of("number_of_shards"); + public static final List SETTINGS = Stream.concat( + SETTINGS_DYNAMIC.stream(), SETTINGS_STATIC.stream()).collect(Collectors.toList()); + + final private String name; + final private boolean exists; + final private Settings currentSettings; + final private Map targetSettings; + final private Map currentMappings; + final private Map targetMappings; + final private boolean enableIndexMappingsReindex; + final private boolean enableIndexSettingsReindex; + + /* Calculated */ + final private boolean requiresReindex; + final private boolean requiresApplySettings; + final private boolean requiresApplyMappings; + final private boolean isPureMappingsAddition; + final private boolean isSettingsReindex; + + public static ReindexConfigBuilder builder() { + return new CalculatedBuilder(); + } + + public static class ReindexConfigBuilder { + // hide calculated fields + private ReindexConfigBuilder requiresReindex(boolean ignored) { + return this; + } + private ReindexConfigBuilder requiresApplySettings(boolean ignored) { + return this; + } + private ReindexConfigBuilder requiresApplyMappings(boolean ignored) { + return this; + } + private ReindexConfigBuilder isPureMappingsAddition(boolean ignored) { + return this; + } + private ReindexConfigBuilder isSettingsReindexRequired(boolean ignored) { + return this; + } + + // ensure sorted + public ReindexConfigBuilder currentMappings(Map currentMappings) { + this.currentMappings = sortMap(currentMappings); + return this; + } + public ReindexConfigBuilder targetMappings(Map targetMappings) { + this.targetMappings = sortMap(targetMappings); + return this; + } + + private static TreeMap sortMap(Map input) { + return input.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> { + if 
(e.getValue() instanceof Map) { + return sortMap((Map) e.getValue()); + } else { + return String.valueOf(e.getValue()); + } + }, + (oldValue, newValue) -> newValue, TreeMap::new)); + } + } + + /** + * Implement calculated fields + */ + private static class CalculatedBuilder extends ReindexConfigBuilder { + @Override + public ReindexConfig build() { + if (super.exists) { + /* Consider mapping changes */ + MapDifference mappingsDiff = Maps.difference( + (TreeMap) super.currentMappings.getOrDefault("properties", new TreeMap()), + (TreeMap) super.targetMappings.getOrDefault("properties", new TreeMap())); + super.requiresApplyMappings = !mappingsDiff.entriesDiffering().isEmpty(); + super.isPureMappingsAddition = !mappingsDiff.areEqual() && mappingsDiff.entriesDiffering().isEmpty() + && !mappingsDiff.entriesOnlyOnRight().isEmpty(); + if (super.requiresApplyMappings && super.isPureMappingsAddition) { + log.info("Index: {} - New fields have been added to index. Adding: {}", + super.name, mappingsDiff.entriesDiffering()); + } else if (super.requiresApplyMappings) { + log.info("Index: {} - There's diff between new mappings (left) and old mappings (right): {}", + super.name, mappingsDiff.entriesDiffering()); + } + + /* Consider analysis and settings changes */ + super.requiresApplySettings = !isSettingsEqual() || !isAnalysisEqual(); + super.isSettingsReindex = isSettingsReindexRequired(); + + /* Determine reindexing required - some settings and mappings do not require reindex, analysis always does */ + if (super.requiresApplyMappings && !super.isPureMappingsAddition) { + if (super.enableIndexMappingsReindex) { + super.requiresReindex = true; + } else { + log.warn("Index: {} - There's diff between new mappings, however reindexing is DISABLED.", super.name); + } + } + if (super.isSettingsReindex) { + try { + if (!isAnalysisEqual()) { + log.info("Index: {} - There's an update to `analysis` settings that requires reindexing. 
Target: {} Current: {}", + super.name, OBJECT_MAPPER.writeValueAsString(super.targetSettings), super.currentSettings); + } + if (!isSettingsEqual()) { + log.info("Index: {} - There's an update to settings that requires reindexing. Target: {} Current: {}", + super.name, OBJECT_MAPPER.writeValueAsString(super.targetSettings), super.currentSettings); + } + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + if (super.enableIndexSettingsReindex) { + super.requiresReindex = true; + } else { + log.warn("Index: {} - There's an update to settings that requires reindexing, however reindexing is DISABLED", super.name); + } + } + } + return super.build(); + } + + private boolean isAnalysisEqual() { + if (!super.targetSettings.containsKey("index")) { + return true; + } + Map indexSettings = (Map) super.targetSettings.get("index"); + if (!indexSettings.containsKey("analysis")) { + return true; + } + // Compare analysis section + Map newAnalysis = (Map) indexSettings.get("analysis"); + Settings oldAnalysis = super.currentSettings.getByPrefix("index.analysis."); + return equalsGroup(newAnalysis, oldAnalysis); + } + + private boolean isSettingsEqual() { + if (!super.targetSettings.containsKey("index")) { + return true; + } + Map indexSettings = (Map) super.targetSettings.get("index"); + return SETTINGS.stream() + .allMatch(settingKey -> Objects.equals(indexSettings.get(settingKey).toString(), + super.currentSettings.get("index." + settingKey))); + } + + private boolean isSettingsReindexRequired() { + if (!super.targetSettings.containsKey("index")) { + return false; + } + Map indexSettings = (Map) super.targetSettings.get("index"); + + if (SETTINGS_STATIC.stream().anyMatch(settingKey -> + !Objects.equals(indexSettings.get(settingKey).toString(), super.currentSettings.get("index." 
+ settingKey)))) { + return true; + } + + return indexSettings.containsKey("analysis") + && !equalsGroup((Map) indexSettings.get("analysis"), + super.currentSettings.getByPrefix("index.analysis.")); + } + } + + private static boolean equalsGroup(Map newSettings, Settings oldSettings) { + if (!newSettings.keySet().equals(oldSettings.names())) { + return false; + } + + for (String key : newSettings.keySet()) { + // Skip urn stop filter, as adding new entities will cause this filter to change + // No need to reindex every time a new entity is added + if (key.equals("urn_stop_filter")) { + continue; + } + if (newSettings.get(key) instanceof Map) { + if (!equalsGroup((Map) newSettings.get(key), oldSettings.getByPrefix(key + "."))) { + return false; + } + } else if (newSettings.get(key) instanceof List) { + if (!newSettings.get(key).equals(oldSettings.getAsList(key))) { + return false; + } + } else { + if (!newSettings.get(key).toString().equals(oldSettings.get(key))) { + return false; + } + } + } + return true; + } +} diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index 8fd0ace9b5ccdd..db574cc9e93214 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -34,6 +34,8 @@ public class SettingsBuilder { public static final String MAX_NGRAM_DIFF = "max_ngram_diff"; public static final String MIN_SHINGLE_SIZE = "min_shingle_size"; public static final String MAX_SHINGLE_SIZE = "max_shingle_size"; + + public static final String DOC_VALUES = "doc_values"; public static final String NGRAM = "ngram"; public static final String NORMALIZER = "normalizer"; public static final String PATTERN = "pattern"; diff --git 
a/metadata-io/src/main/java/com/linkedin/metadata/shared/ElasticSearchIndexed.java b/metadata-io/src/main/java/com/linkedin/metadata/shared/ElasticSearchIndexed.java new file mode 100644 index 00000000000000..c3ef7c2503cfd7 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/shared/ElasticSearchIndexed.java @@ -0,0 +1,21 @@ +package com.linkedin.metadata.shared; + +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; + +import java.io.IOException; +import java.util.List; + +public interface ElasticSearchIndexed { + /** + * The index configurations for the given service. + * @return List of reindex configurations + */ + List getReindexConfigs() throws IOException; + + /** + * Mirrors the service's functions which + * are expected to build/reindex as needed based + * on the reindex configurations above + */ + void reindexAll() throws IOException; +} diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java index 7390b49b3fc1af..1585ec8ad6564f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java @@ -7,8 +7,10 @@ import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.run.IngestionRunSummary; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.utils.ESUtils; +import com.linkedin.metadata.shared.ElasticSearchIndexed; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.mxe.SystemMetadata; import java.io.IOException; @@ -39,7 +41,7 @@ 
@Slf4j @RequiredArgsConstructor -public class ElasticSearchSystemMetadataService implements SystemMetadataService { +public class ElasticSearchSystemMetadataService implements SystemMetadataService, ElasticSearchIndexed { private final ESBulkProcessor _esBulkProcessor; private final IndexConvention _indexConvention; @@ -196,13 +198,25 @@ public List listRuns(Integer pageOffset, Integer pageSize, public void configure() { log.info("Setting up system metadata index"); try { - _indexBuilder.buildIndex(_indexConvention.getIndexName(INDEX_NAME), SystemMetadataMappingsBuilder.getMappings(), - Collections.emptyMap()); + for (ReindexConfig config : getReindexConfigs()) { + _indexBuilder.buildIndex(config); + } } catch (IOException ie) { throw new RuntimeException("Could not configure system metadata index", ie); } } + @Override + public List getReindexConfigs() throws IOException { + return List.of(_indexBuilder.buildReindexState(_indexConvention.getIndexName(INDEX_NAME), + SystemMetadataMappingsBuilder.getMappings(), Collections.emptyMap())); + } + + @Override + public void reindexAll() { + configure(); + } + @VisibleForTesting @Override public void clear() { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java index e366fed67ecb1d..b8d86a31820577 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java @@ -15,9 +15,11 @@ import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.Filter; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; import 
com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.search.utils.QueryUtils; +import com.linkedin.metadata.shared.ElasticSearchIndexed; import com.linkedin.metadata.timeseries.TimeseriesAspectService; import com.linkedin.metadata.timeseries.elastic.indexbuilder.MappingsBuilder; import com.linkedin.metadata.timeseries.elastic.indexbuilder.TimeseriesAspectIndexBuilders; @@ -30,6 +32,7 @@ import com.linkedin.timeseries.DeleteAspectValuesResult; import com.linkedin.timeseries.GenericTable; import com.linkedin.timeseries.GroupingBucket; + import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.List; @@ -56,7 +59,7 @@ @Slf4j -public class ElasticSearchTimeseriesAspectService implements TimeseriesAspectService { +public class ElasticSearchTimeseriesAspectService implements TimeseriesAspectService, ElasticSearchIndexed { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String TIMESTAMP_FIELD = "timestampMillis"; private static final String EVENT_FIELD = "event"; @@ -111,7 +114,17 @@ private static EnvelopedAspect parseDocument(@Nonnull SearchHit doc) { @Override public void configure() { - _indexBuilders.buildAll(); + _indexBuilders.reindexAll(); + } + + @Override + public List getReindexConfigs() { + return _indexBuilders.getReindexConfigs(); + } + + @Override + public void reindexAll() { + configure(); } @Override diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java index d0fd26d737cf0f..44db9696fa796b 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java @@ -1,38 +1,54 @@ package com.linkedin.metadata.timeseries.elastic.indexbuilder; -import com.linkedin.metadata.models.AspectSpec; -import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ReindexConfig; +import com.linkedin.metadata.shared.ElasticSearchIndexed; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import java.io.IOException; import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import com.linkedin.util.Pair; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @Slf4j @RequiredArgsConstructor -public class TimeseriesAspectIndexBuilders { +public class TimeseriesAspectIndexBuilders implements ElasticSearchIndexed { private final ESIndexBuilder _indexBuilder; private final EntityRegistry _entityRegistry; private final IndexConvention _indexConvention; - public void buildAll() { - for (EntitySpec entitySpec : _entityRegistry.getEntitySpecs().values()) { - for (AspectSpec aspectSpec : entitySpec.getAspectSpecs()) { - if (aspectSpec.isTimeseries()) { - try { - _indexBuilder.buildIndex( - _indexConvention.getTimeseriesAspectIndexName(entitySpec.getName(), aspectSpec.getName()), - MappingsBuilder.getMappings(aspectSpec), Collections.emptyMap()); - } catch (IOException e) { - log.error("Issue while building timeseries field index for entity {} aspect {}", entitySpec.getName(), - aspectSpec.getName()); - log.error("Exception: ", e); - } - } + @Override + public void reindexAll() { + for (ReindexConfig config : getReindexConfigs()) { + try { + _indexBuilder.buildIndex(config); + } catch (IOException e) { + throw new RuntimeException(e); } } } + + @Override + public List 
getReindexConfigs() { + return _entityRegistry.getEntitySpecs().values().stream() + .flatMap(entitySpec -> entitySpec.getAspectSpecs().stream() + .map(aspectSpec -> Pair.of(entitySpec, aspectSpec))) + .filter(pair -> pair.getSecond().isTimeseries()) + .map(pair -> { + try { + return _indexBuilder.buildReindexState( + _indexConvention.getTimeseriesAspectIndexName(pair.getFirst().getName(), pair.getSecond().getName()), + MappingsBuilder.getMappings(pair.getSecond()), Collections.emptyMap()); + } catch (IOException e) { + log.error("Issue while building timeseries field index for entity {} aspect {}", pair.getFirst().getName(), + pair.getSecond().getName()); + throw new RuntimeException(e); + } + }).collect(Collectors.toList()); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java index 2cdbaaced0d5a0..002a5579fb199f 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java @@ -126,7 +126,7 @@ protected SearchService searchService( ); // Build indices & write fixture data - indexBuilders.buildAll(); + indexBuilders.reindexAll(); FixtureReader.builder() .bulkProcessor(_bulkProcessor) diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java b/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java index 0f5a2cecfd215d..72645393d70f20 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java @@ -171,7 +171,7 @@ protected SearchService searchService( ); // Build indices - indexBuilders.buildAll(); + indexBuilders.reindexAll(); return service; } From ea79de50187bffc555f93d0c4009a097466903cd Mon Sep 17 00:00:00 2001 From: David Leifker Date: Fri, 30 Dec 2022 19:31:51 -0600 Subject: [PATCH 07/12] 
lint --- .../metadata/kafka/boot/ApplicationStartupListener.java | 1 - .../com/linkedin/metadata/kafka/hook/UpdateIndicesHook.java | 2 -- 2 files changed, 3 deletions(-) diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/boot/ApplicationStartupListener.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/boot/ApplicationStartupListener.java index a45518c4caee35..38e4be56c6278f 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/boot/ApplicationStartupListener.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/boot/ApplicationStartupListener.java @@ -5,7 +5,6 @@ import com.linkedin.metadata.kafka.elasticsearch.indices.BuildIndicesKafkaListener; import javax.annotation.Nonnull; import lombok.extern.slf4j.Slf4j; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.ApplicationListener; import org.springframework.context.annotation.Conditional; diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHook.java index 4a8297c874d140..05168a62129025 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHook.java @@ -14,7 +14,6 @@ import com.linkedin.events.metadata.ChangeType; import com.linkedin.gms.factory.common.GraphServiceFactory; import com.linkedin.gms.factory.common.SystemMetadataServiceFactory; -import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; import com.linkedin.gms.factory.search.EntitySearchServiceFactory; import 
com.linkedin.gms.factory.search.SearchDocumentTransformerFactory; @@ -22,7 +21,6 @@ import com.linkedin.metadata.Constants; import com.linkedin.metadata.graph.Edge; import com.linkedin.metadata.graph.GraphService; -import com.linkedin.metadata.kafka.elasticsearch.indices.BuildIndicesKafkaListener; import com.linkedin.metadata.key.SchemaFieldKey; import com.linkedin.metadata.models.AspectSpec; import com.linkedin.metadata.models.EntitySpec; From fc9ee63216e83cd91c594b055affebe9db5bfe48 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Fri, 30 Dec 2022 19:43:16 -0600 Subject: [PATCH 08/12] fix test --- .../elasticsearch/indexbuilder/MappingsBuilderTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java index dc780e2dc96b4e..8c85d64f5d24ff 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java @@ -16,13 +16,13 @@ public void testMappingsBuilder() { Map result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec()); assertEquals(result.size(), 1); Map properties = (Map) result.get("properties"); - assertEquals(properties.size(), 16); + assertEquals(properties.size(), 19); assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword", "fields", ImmutableMap.of("delimited", ImmutableMap.of("type", "text", "analyzer", "urn_component", "search_analyzer", "query_urn_component"), "ngram", - ImmutableMap.of("type", "search_as_you_type", "max_shingle_size", "4")))); + ImmutableMap.of("type", "search_as_you_type", "max_shingle_size", "4", "doc_values", "false", "doc_values", "false")))); assertEquals(properties.get("runId"), ImmutableMap.of("type", 
"keyword")); assertTrue(properties.containsKey("browsePaths")); // KEYWORD From 943af8faa6313a095d45e23b82a7cd2e94817a7c Mon Sep 17 00:00:00 2001 From: David Leifker Date: Fri, 30 Dec 2022 22:19:27 -0600 Subject: [PATCH 09/12] fix tests --- .../models/SearchableFieldSpecExtractor.java | 18 ++++++++++++++---- .../indexbuilder/ReindexConfig.java | 2 +- .../query/request/SearchQueryBuilder.java | 18 ++++++++++++------ .../com/linkedin/metadata/ESTestUtils.java | 2 +- .../fixtures/SampleDataFixtureTests.java | 1 - .../indexbuilder/ESIndexBuilderTest.java | 6 ++++-- .../indexbuilder/MappingsBuilderTest.java | 2 +- .../query/request/SearchQueryBuilderTest.java | 8 ++++---- .../request/SearchRequestHandlerTest.java | 5 +++-- .../SearchDocumentTransformerTest.java | 2 +- .../kafka/hook/UpdateIndicesHookTest.java | 8 ++------ 11 files changed, 43 insertions(+), 29 deletions(-) diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java index c610443e9282ab..1ffe74f166b18b 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java @@ -27,16 +27,26 @@ public class SearchableFieldSpecExtractor implements SchemaVisitor { private static final String MAP = "map"; - private static final Map URN_SEARCH_PROPERTIES; + public static final Map PRIMARY_URN_SEARCH_PROPERTIES; + private static final Map SECONDARY_URN_SEARCH_PROPERTIES; static { - URN_SEARCH_PROPERTIES = new DataMap(); - URN_SEARCH_PROPERTIES.putAll( + PRIMARY_URN_SEARCH_PROPERTIES = new DataMap(); + PRIMARY_URN_SEARCH_PROPERTIES.putAll( Map.of( "enableAutocomplete", "true", "fieldType", "URN", "boostScore", "4.0" ) ); + + SECONDARY_URN_SEARCH_PROPERTIES = new DataMap(); + SECONDARY_URN_SEARCH_PROPERTIES.putAll( + Map.of( + 
"enableAutocomplete", "false", + "fieldType", "URN", + "boostScore", "0.4" + ) + ); } @@ -99,7 +109,7 @@ private Object getAnnotationObj(TraverserContext context) { .getOrDefault("class", "").equals("com.linkedin.common.urn.Urn"); if (isUrn) { - return URN_SEARCH_PROPERTIES; + return SECONDARY_URN_SEARCH_PROPERTIES; } else { // Next, check resolved properties for annotations on primitives. final Map resolvedProperties = FieldSpecUtils.getResolvedProperties(currentSchema); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java index c0c5c4687d79d3..458441f5a2669a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ReindexConfig.java @@ -106,7 +106,7 @@ public ReindexConfig build() { MapDifference mappingsDiff = Maps.difference( (TreeMap) super.currentMappings.getOrDefault("properties", new TreeMap()), (TreeMap) super.targetMappings.getOrDefault("properties", new TreeMap())); - super.requiresApplyMappings = !mappingsDiff.entriesDiffering().isEmpty(); + super.requiresApplyMappings = !mappingsDiff.areEqual(); super.isPureMappingsAddition = !mappingsDiff.areEqual() && mappingsDiff.entriesDiffering().isEmpty() && !mappingsDiff.entriesOnlyOnRight().isEmpty(); if (super.requiresApplyMappings && super.isPureMappingsAddition) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index 18e3b44cba51db..9baa46561d92b3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -24,6 +24,7 @@ import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder; import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders; +import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_LOWERCASE_ANALYZER; @@ -31,7 +32,7 @@ public class SearchQueryBuilder { private static final Set TYPES_WITH_DELIMITED_SUBFIELD = new HashSet<>(Arrays.asList(FieldType.TEXT, FieldType.TEXT_PARTIAL)); private static final Set TYPES_WITH_NGRAM_SUBFIELD = - new HashSet<>(Arrays.asList(FieldType.TEXT_PARTIAL, FieldType.URN_PARTIAL)); + new HashSet<>(List.of(FieldType.TEXT_PARTIAL)); private SearchQueryBuilder() { } @@ -74,6 +75,12 @@ private static QueryBuilder buildInternalQuery(@Nonnull EntitySpec entitySpec, @ escapedBuilder.analyzer(KEYWORD_LOWERCASE_ANALYZER); } + // Always present + List.of("urn", "urn.delimited", "urn.ngram").forEach(urnField -> { + simpleBuilder.field(urnField, Float.parseFloat((String) PRIMARY_URN_SEARCH_PROPERTIES.get("boostScore"))); + escapedBuilder.field(urnField, Float.parseFloat((String) PRIMARY_URN_SEARCH_PROPERTIES.get("boostScore"))); + }); + List searchableFieldSpecs = entitySpec.getSearchableFieldSpecs(); for (SearchableFieldSpec fieldSpec : searchableFieldSpecs) { if (!fieldSpec.getSearchableAnnotation().isQueryByDefault()) { @@ -90,14 +97,13 @@ private static QueryBuilder buildInternalQuery(@Nonnull EntitySpec entitySpec, @ simpleBuilder.field(fieldName + ".delimited", (float) (boostScore * 0.4)); escapedBuilder.field(fieldName + ".delimited", (float) (boostScore * 0.4)); } - if (TYPES_WITH_NGRAM_SUBFIELD.contains(fieldType)) { + if (FieldType.URN_PARTIAL.equals(fieldType)) { + simpleBuilder.field(fieldName + ".delimited", (float) (boostScore * 0.4)); + 
escapedBuilder.field(fieldName + ".delimited", (float) (boostScore * 0.4)); + } else if (TYPES_WITH_NGRAM_SUBFIELD.contains(fieldType) || fieldSpec.getSearchableAnnotation().isEnableAutocomplete()) { simpleBuilder.field(fieldName + ".ngram", (float) (boostScore * 0.1)); escapedBuilder.field(fieldName + ".ngram", (float) (boostScore * 0.1)); } - if ("urn".equals(fieldName)) { - simpleBuilder.field(fieldName + ".delimited", (float) (boostScore)); - escapedBuilder.field(fieldName + ".delimited", (float) (boostScore)); - } } if (safeMode) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java index 5d74fa07e7b18e..c7433600161c94 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java @@ -54,7 +54,7 @@ private ESTestUtils() { static { ES_CONTAINER = new ElasticsearchContainer(DOCKER_IMAGE_NAME); checkContainerEngine(ES_CONTAINER.getDockerClient()); - ES_CONTAINER.withEnv("ES_JAVA_OPTS", "-Xms64m -Xmx200m -XX:MaxDirectMemorySize=268435456") + ES_CONTAINER.withEnv("ES_JAVA_OPTS", "-Xms64m -Xmx384m -XX:MaxDirectMemorySize=368435456") .withStartupTimeout(Duration.ofMinutes(5)); // usually < 1min } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java index a84420a31956e9..92d281334e39b2 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java @@ -57,7 +57,6 @@ public void testFixtureInitialization() { final SearchResult result = search(searchService, "test"); - // TODO: Add tags fields to query Map expectedTypes = Map.of( "dataset", 8, "chart", 0, diff 
--git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java index 06902758ced557..c854a53fe12528 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java @@ -43,7 +43,8 @@ public class ESIndexBuilderTest extends AbstractTestNGSpringContextTests { @BeforeClass public void setup() { _indexClient = _searchClient.indices(); - testDefaultBuilder = new ESIndexBuilder(_searchClient, 1, 0, 0, 0, Map.of(), false, false); + testDefaultBuilder = new ESIndexBuilder(_searchClient, 1, 0, 0, + 0, Map.of(), false, false); } @BeforeMethod @@ -72,7 +73,8 @@ public static GetIndexResponse getTestIndex() throws IOException { @Test public void testESIndexBuilderCreation() throws Exception { - ESIndexBuilder customIndexBuilder = new ESIndexBuilder(_searchClient, 2, 0, 1, 0, Map.of(), false, false); + ESIndexBuilder customIndexBuilder = new ESIndexBuilder(_searchClient, 2, 0, 1, + 0, Map.of(), false, false); customIndexBuilder.buildIndex(TEST_INDEX_NAME, Map.of(), Map.of()); GetIndexResponse resp = getTestIndex(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java index 8c85d64f5d24ff..bfef9702f9d769 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java @@ -22,7 +22,7 @@ public void testMappingsBuilder() { ImmutableMap.of("delimited", ImmutableMap.of("type", "text", "analyzer", "urn_component", 
"search_analyzer", "query_urn_component"), "ngram", - ImmutableMap.of("type", "search_as_you_type", "max_shingle_size", "4", "doc_values", "false", "doc_values", "false")))); + ImmutableMap.of("type", "search_as_you_type", "max_shingle_size", "4", "doc_values", "false")))); assertEquals(properties.get("runId"), ImmutableMap.of("type", "keyword")); assertTrue(properties.containsKey("browsePaths")); // KEYWORD diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java index c342588c618bd6..a9b26c4c56ca95 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java @@ -27,7 +27,7 @@ public void testQueryBuilderFulltext() { assertEquals(simpleQuery.value(), "testQuery"); assertNull(simpleQuery.analyzer()); Map keywordFields = simpleQuery.fields(); - assertEquals(keywordFields.size(), 18); + assertEquals(keywordFields.size(), 23); assertEquals(keywordFields.get("keyPart1").floatValue(), 10.0f); assertFalse(keywordFields.containsKey("keyPart3")); assertEquals(keywordFields.get("textFieldOverride").floatValue(), 1.0f); @@ -38,9 +38,9 @@ public void testQueryBuilderFulltext() { assertEquals(escapedQuery.queryString(), "testQuery"); assertNull(escapedQuery.analyzer()); Map textFields = escapedQuery.fields(); - assertEquals(textFields.size(), 18); + assertEquals(textFields.size(), 23); assertEquals(textFields.get("keyPart1.delimited").floatValue(), 4.0f); - assertFalse(textFields.containsKey("keyPart1.ngram")); + assertTrue(textFields.containsKey("keyPart1.ngram")); assertEquals(textFields.get("textFieldOverride.delimited").floatValue(), 0.4f); assertFalse(textFields.containsKey("textFieldOverride.ngram")); 
assertEquals(textFields.get("textArrayField.delimited").floatValue(), 0.4f); @@ -64,7 +64,7 @@ public void testQueryBuilderStructured() { assertEquals(keywordQuery.queryString(), "testQuery"); assertEquals(keywordQuery.analyzer(), "custom_keyword"); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 18); + assertEquals(keywordFields.size(), 23); assertEquals(keywordFields.get("keyPart1").floatValue(), 10.0f); assertFalse(keywordFields.containsKey("keyPart3")); assertEquals(keywordFields.get("textFieldOverride").floatValue(), 1.0f); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java index e6a4840d71b09a..9a44d4ec10ebee 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java @@ -43,10 +43,11 @@ public void testSearchRequestHandler() { HighlightBuilder highlightBuilder = sourceBuilder.highlighter(); List fields = highlightBuilder.fields().stream().map(HighlightBuilder.Field::name).collect(Collectors.toList()); - assertEquals(fields.size(), 20); + assertEquals(fields.size(), 24); List highlightableFields = ImmutableList.of("keyPart1", "textArrayField", "textFieldOverride", "foreignKey", "nestedForeignKey", - "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "urn"); + "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "keyPart2", + "nestedArrayForeignKey", "foreignKeyArray"); highlightableFields.forEach(field -> { assertTrue(fields.contains(field)); assertTrue(fields.contains(field + ".*")); diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java index 1bd65769e51da7..c343ea97a2c43a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java @@ -30,7 +30,7 @@ public void testTransform() throws IOException { ObjectNode parsedJson = (ObjectNode) OBJECT_MAPPER.readTree(result.get()); assertEquals(parsedJson.get("urn").asText(), snapshot.getUrn().toString()); assertEquals(parsedJson.get("keyPart1").asText(), "key"); - assertFalse(parsedJson.has("keyPart2")); + assertTrue(parsedJson.has("keyPart2")); assertEquals(parsedJson.get("keyPart3").asText(), "VALUE_1"); assertFalse(parsedJson.has("textField")); assertEquals(parsedJson.get("textFieldOverride").asText(), "test"); diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java index a153f6152e7ce2..983794db7f87a1 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java @@ -91,9 +91,7 @@ public void setupTest() { _mockTimeseriesAspectService, _mockSystemMetadataService, registry, - _mockSearchDocumentTransformer, - _mockBuildIndicesKafkaListener, - _mockConfigurationProvider + _mockSearchDocumentTransformer ); } @@ -125,9 +123,7 @@ public void testInputFieldsEdgesAreAdded() throws Exception { _mockTimeseriesAspectService, _mockSystemMetadataService, mockEntityRegistry, - _mockSearchDocumentTransformer, - _mockBuildIndicesKafkaListener, - _mockConfigurationProvider + 
_mockSearchDocumentTransformer ); _updateIndicesHook.invoke(event); From e4725d5d2b8427559906e6f3d36d02004badac89 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Sat, 31 Dec 2022 13:41:38 -0600 Subject: [PATCH 10/12] Support aliases during clone Add default clone setting --- .../buildindices/PreConfigureESStep.java | 38 ++++++++++++++++--- .../src/main/resources/application.yml | 1 + 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java index 41e87cb33759f3..85ce5fb8a1f223 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/buildindices/PreConfigureESStep.java @@ -7,8 +7,11 @@ import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; + +import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; @@ -16,12 +19,15 @@ import com.linkedin.metadata.shared.ElasticSearchIndexed; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.NotImplementedException; import org.elasticsearch.ElasticsearchStatusException; +import org.elasticsearch.action.admin.indices.alias.get.GetAliasesRequest; import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest; +import org.elasticsearch.client.GetAliasesResponse; import org.elasticsearch.client.RequestOptions; import org.elasticsearch.client.indices.ResizeRequest; -import static com.linkedin.datahub.upgrade.buildindices.IndexUtils.*; +import static 
com.linkedin.datahub.upgrade.buildindices.IndexUtils.getAllReindexConfigs; @RequiredArgsConstructor @@ -52,7 +58,23 @@ public Function executable() { .collect(Collectors.toList()); for (ReindexConfig indexConfig : indexConfigs) { - UpdateSettingsRequest request = new UpdateSettingsRequest(indexConfig.name()); + String indexName = indexConfig.name(); + + GetAliasesResponse aliasResponse = getAlias(indexConfig.name()); + if (!aliasResponse.getAliases().isEmpty()) { + Set indices = aliasResponse.getAliases().keySet(); + if (indices.size() != 1) { + throw new NotImplementedException( + String.format("Clone not supported for %s indices in alias %s. Indices: %s", + indices.size(), + indexConfig.name(), + String.join(",", indices))); + } + indexName = indices.stream().findFirst().get(); + log.info("Alias {} resolved to index {}", indexConfig.name(), indexName); + } + + UpdateSettingsRequest request = new UpdateSettingsRequest(indexName); Map indexSettings = ImmutableMap.of("index.blocks.write", "true"); request.settings(indexSettings); @@ -60,6 +82,7 @@ try { ack = _esComponents.getSearchClient().indices().putSettings(request, RequestOptions.DEFAULT).isAcknowledged(); + log.info("Updated index {} with new settings. Settings: {}, Acknowledged: {}", indexName, indexSettings, ack); } catch (ElasticsearchStatusException ese) { // Cover first run case, indices won't exist so settings updates won't work nor will the rest of the preConfigure steps. // Since no data are in there they are skippable. @@ -69,7 +92,7 @@ } throw ese; } - log.info("Updated index {} with new settings. Settings: {}, Acknowledged: {}", indexConfig.name(), indexSettings, ack); +
+ " Please fix the error and re-run the BuildIndices upgrade job."); @@ -79,10 +102,10 @@ public Function executable() { // Clone indices if (_configurationProvider.getElasticSearch().getBuildIndices().isCloneIndices()) { String clonedName = indexConfig.name() + "_clone_" + System.currentTimeMillis(); - ResizeRequest resizeRequest = new ResizeRequest(clonedName, indexConfig.name()); + ResizeRequest resizeRequest = new ResizeRequest(clonedName, indexName); boolean cloneAck = _esComponents.getSearchClient().indices().clone(resizeRequest, RequestOptions.DEFAULT).isAcknowledged(); - log.info("Cloned index {} into {}, Acknowledged: {}", indexConfig.name(), clonedName, cloneAck); + log.info("Cloned index {} into {}, Acknowledged: {}", indexName, clonedName, cloneAck); if (!cloneAck) { log.error("Partial index settings update, cloned indices may need to be cleaned up: {}", clonedName); return new DefaultUpgradeStepResult(id(), UpgradeStepResult.Result.FAILED); @@ -96,4 +119,9 @@ public Function executable() { return new DefaultUpgradeStepResult(id(), UpgradeStepResult.Result.SUCCEEDED); }; } + + private GetAliasesResponse getAlias(String name) throws IOException { + return _esComponents.getSearchClient().indices() + .getAlias(new GetAliasesRequest(name), RequestOptions.DEFAULT); + } } diff --git a/metadata-service/factories/src/main/resources/application.yml b/metadata-service/factories/src/main/resources/application.yml index 708fb23df34b27..556930eb160980 100644 --- a/metadata-service/factories/src/main/resources/application.yml +++ b/metadata-service/factories/src/main/resources/application.yml @@ -165,6 +165,7 @@ elasticsearch: settingsOverrides: ${ELASTICSEARCH_INDEX_BUILDER_SETTINGS_OVERRIDES:#{null}} entitySettingsOverrides: ${ELASTICSEARCH_INDEX_BUILDER_ENTITY_SETTINGS_OVERRIDES:#{null}} buildIndices: + cloneIndices: ${ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES:true} initialBackOffMs: ${ELASTICSEARCH_BUILD_INDICES_INITIAL_BACK_OFF_MILLIS:5000} maxBackOffs: 
${ELASTICSEARCH_BUILD_INDICES_MAX_BACK_OFFS:5} backOffFactor: ${ELASTICSEARCH_BUILD_INDICES_BACK_OFF_FACTOR:2} # Multiplicative factor for back off, default values will result in waiting 5min 15s From 626cad7c047ed026f78fdb66b31c956ccae08952 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Sun, 1 Jan 2023 23:27:03 -0600 Subject: [PATCH 11/12] fulltext to structured flag set default to enable fulltext --- .../datahub-gms/env/docker-without-neo4j.env | 1 + docker/datahub-gms/env/docker.cassandra.env | 1 + docker/datahub-gms/env/docker.mariadb.env | 1 + docker/datahub-gms/env/docker.postgres.env | 1 + .../env/docker-without-neo4j.env | 1 + ...ocker-compose-without-neo4j.quickstart.yml | 1 + ...ose.consumers-without-neo4j.quickstart.yml | 1 + .../metadata/client/JavaEntityClient.java | 20 +++++----- .../metadata/search/SearchService.java | 3 +- .../client/CachingEntitySearchService.java | 10 ++--- .../elasticsearch/ElasticSearchService.java | 4 +- .../indexbuilder/SettingsBuilder.java | 13 ++++++- .../elasticsearch/query/ESSearchDAO.java | 6 +-- .../query/request/SearchQueryBuilder.java | 8 ++-- .../query/request/SearchRequestHandler.java | 10 ++--- .../com/linkedin/metadata/ESTestUtils.java | 8 +++- .../metadata/search/SearchServiceTest.java | 34 +++++++++++------ .../ElasticSearchServiceTest.java | 4 +- .../fixtures/SampleDataFixtureTests.java | 38 ++++++++++++++++++- .../query/request/SearchQueryBuilderTest.java | 4 +- .../linkedin/metadata/query/SearchFlags.pdl | 2 +- ...com.linkedin.entity.entities.restspec.json | 16 +++++++- ...com.linkedin.entity.entities.snapshot.json | 6 ++- .../linkedin/entity/client/EntityClient.java | 4 +- .../entity/client/RestliEntityClient.java | 8 ++-- .../resources/entity/EntityResource.java | 14 ++++--- smoke-test/smoke-dev.sh | 29 -------------- 27 files changed, 152 insertions(+), 96 deletions(-) delete mode 100755 smoke-test/smoke-dev.sh diff --git a/docker/datahub-gms/env/docker-without-neo4j.env 
b/docker/datahub-gms/env/docker-without-neo4j.env index e1917f3bbd2b94..ee6ed973cbb5bb 100644 --- a/docker/datahub-gms/env/docker-without-neo4j.env +++ b/docker/datahub-gms/env/docker-without-neo4j.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms EBEAN_DATASOURCE_USERNAME=datahub EBEAN_DATASOURCE_PASSWORD=datahub EBEAN_DATASOURCE_HOST=mysql:3306 diff --git a/docker/datahub-gms/env/docker.cassandra.env b/docker/datahub-gms/env/docker.cassandra.env index 18263b297e7a71..ed265d0c53dd10 100644 --- a/docker/datahub-gms/env/docker.cassandra.env +++ b/docker/datahub-gms/env/docker.cassandra.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms KAFKA_BOOTSTRAP_SERVER=broker:29092 KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 ELASTICSEARCH_HOST=elasticsearch diff --git a/docker/datahub-gms/env/docker.mariadb.env b/docker/datahub-gms/env/docker.mariadb.env index a40126d7208c9d..968fc8788afe0b 100644 --- a/docker/datahub-gms/env/docker.mariadb.env +++ b/docker/datahub-gms/env/docker.mariadb.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms EBEAN_DATASOURCE_USERNAME=datahub EBEAN_DATASOURCE_PASSWORD=datahub EBEAN_DATASOURCE_HOST=mariadb:3306 diff --git a/docker/datahub-gms/env/docker.postgres.env b/docker/datahub-gms/env/docker.postgres.env index f99134ebb02388..13d0e53a170edd 100644 --- a/docker/datahub-gms/env/docker.postgres.env +++ b/docker/datahub-gms/env/docker.postgres.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms EBEAN_DATASOURCE_USERNAME=datahub EBEAN_DATASOURCE_PASSWORD=datahub EBEAN_DATASOURCE_HOST=postgres:5432 diff --git a/docker/datahub-mae-consumer/env/docker-without-neo4j.env b/docker/datahub-mae-consumer/env/docker-without-neo4j.env index 183d66987e3585..9c6d3e88aea8ab 100644 --- a/docker/datahub-mae-consumer/env/docker-without-neo4j.env 
+++ b/docker/datahub-mae-consumer/env/docker-without-neo4j.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-mcl DATAHUB_GMS_HOST=datahub-gms DATAHUB_GMS_PORT=8080 diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 7fb9d263094db1..172b00edb5be1a 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -76,6 +76,7 @@ services: environment: - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} + - BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms - EBEAN_DATASOURCE_USERNAME=datahub - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_HOST=mysql:3306 diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index 614034a3447042..48b0cdef426c96 100644 --- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -6,6 +6,7 @@ services: datahub-mae-consumer: container_name: datahub-mae-consumer environment: + - BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-mcl - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - MAE_CONSUMER_ENABLED=true diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index f94e6b3eb3774a..9af5149bd2d0b5 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -28,6 +28,7 @@ import com.linkedin.metadata.query.AutoCompleteResult; import 
com.linkedin.metadata.query.ListResult; import com.linkedin.metadata.query.ListUrnsResult; +import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.entity.AspectUtils; @@ -248,15 +249,15 @@ public SearchResult search( int start, int count, @Nonnull Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException { - if (Optional.ofNullable(structured).orElse(true)) { + if (Optional.ofNullable(fulltext).orElse(false)) { return ValidationUtils.validateSearchResult( - _entitySearchService.structuredSearch(entity, input, newFilter(requestFilters), null, start, count), _entityService); + _entitySearchService.fullTextSearch(entity, input, newFilter(requestFilters), null, start, count), _entityService); } else { return ValidationUtils.validateSearchResult( - _entitySearchService.fullTextSearch(entity, input, newFilter(requestFilters), null, start, count), _entityService); + _entitySearchService.structuredSearch(entity, input, newFilter(requestFilters), null, start, count), _entityService); } } @@ -305,15 +306,15 @@ public SearchResult search( int start, int count, @Nonnull Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException { - if (Optional.ofNullable(structured).orElse(true)) { + if (Optional.ofNullable(fulltext).orElse(false)) { return ValidationUtils.validateSearchResult( - _entitySearchService.structuredSearch(entity, input, filter, sortCriterion, start, count), + _entitySearchService.fullTextSearch(entity, input, filter, sortCriterion, start, count), _entityService); } else { return ValidationUtils.validateSearchResult( - _entitySearchService.fullTextSearch(entity, input, filter, sortCriterion, start, count), + _entitySearchService.structuredSearch(entity, input, filter, sortCriterion, start, count), 
_entityService); } } @@ -338,7 +339,8 @@ public SearchResult searchAcrossEntities( int count, @Nonnull final Authentication authentication) throws RemoteInvocationException { return ValidationUtils.validateSearchResult( - _searchService.searchAcrossEntities(entities, input, filter, null, start, count, null), _entityService); + _searchService.searchAcrossEntities(entities, input, filter, null, start, count, + new SearchFlags().setFulltext(true)), _entityService); } @Nonnull diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java index 1b136e758ea2d6..220cc6971a3b3a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java @@ -88,7 +88,6 @@ public SearchResult searchAcrossEntities(@Nonnull List entities, @Nonnul log.debug(String.format( "Searching Search documents entities: %s, input: %s, postFilters: %s, sortCriterion: %s, from: %s, size: %s", entities, input, postFilters, sortCriterion, from, size)); - SearchFlags forceFlags = Optional.ofNullable(searchFlags).orElse(new SearchFlags()).setStructured(false); - return _cachingAllEntitiesSearchAggregator.getSearchResults(entities, input, postFilters, sortCriterion, from, size, forceFlags); + return _cachingAllEntitiesSearchAggregator.getSearchResults(entities, input, postFilters, sortCriterion, from, size, searchFlags); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java index 1572304c7bdf00..fd6e9b88e89e28 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java @@ -118,7 +118,7 @@ public SearchResult 
getCachedSearchResults( cacheManager.getCache(ENTITY_SEARCH_SERVICE_SEARCH_CACHE_NAME), batchSize, querySize -> getRawSearchResults(entityName, query, filters, sortCriterion, querySize.getFrom(), - querySize.getSize(), searchFlags.isStructured()), + querySize.getSize(), Boolean.TRUE.equals(searchFlags.isFulltext())), querySize -> Quintet.with(entityName, query, filters, sortCriterion, querySize), flags, enableCache).getSearchResults(from, size); } @@ -197,9 +197,9 @@ private SearchResult getRawSearchResults( final SortCriterion sortCriterion, final int start, final int count, - final boolean structured) { - if (structured) { - return entitySearchService.structuredSearch( + final boolean fulltext) { + if (fulltext) { + return entitySearchService.fullTextSearch( entityName, input, filters, @@ -207,7 +207,7 @@ private SearchResult getRawSearchResults( start, count); } else { - return entitySearchService.fullTextSearch( + return entitySearchService.structuredSearch( entityName, input, filters, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java index 78347af4ff49c2..ea09e3a4b258bc 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java @@ -108,7 +108,7 @@ public SearchResult fullTextSearch(@Nonnull String entityName, @Nonnull String i log.debug(String.format( "Searching FullText Search documents entityName: %s, input: %s, postFilters: %s, sortCriterion: %s, from: %s, size: %s", entityName, input, postFilters, sortCriterion, from, size)); - return esSearchDAO.search(entityName, input, postFilters, sortCriterion, from, size, false); + return esSearchDAO.search(entityName, input, postFilters, sortCriterion, from, size, true); } @Nonnull @@ -118,7 +118,7 @@ public 
SearchResult structuredSearch(@Nonnull String entityName, @Nonnull String log.debug(String.format( "Searching Structured Search documents entityName: %s, input: %s, postFilters: %s, sortCriterion: %s, from: %s, size: %s", entityName, input, postFilters, sortCriterion, from, size)); - return esSearchDAO.search(entityName, input, postFilters, sortCriterion, from, size, true); + return esSearchDAO.search(entityName, input, postFilters, sortCriterion, from, size, false); } @Nonnull diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index db574cc9e93214..406438ecc1e9ec 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -40,6 +40,7 @@ public class SettingsBuilder { public static final String NORMALIZER = "normalizer"; public static final String PATTERN = "pattern"; public static final String PATTERNS = "patterns"; + public static final String REPLACEMENT = "replacement"; public static final String PRESERVE_ORIGINAL = "preserve_original"; public static final String SEARCH_ANALYZER = "search_analyzer"; public static final String SPLIT_ON_NUMERICS = "split_on_numerics"; @@ -69,6 +70,7 @@ public class SettingsBuilder { public static final String FLATTEN_GRAPH = "flatten_graph"; public static final String LOWERCASE = "lowercase"; public static final String MIN_LENGTH_2 = "min_length_2"; + public static final String REPLACE_NUM_LENGTH_3 = "replace_num_length_3"; public static final String MULTIFILTER = "multifilter"; public static final String MULTIFILTER_GRAPH = "multifilter_graph"; public static final String PARTIAL_URN_COMPONENT = "partial_urn_component"; @@ -100,6 +102,7 @@ public class SettingsBuilder { public static final String 
SLASH_TOKENIZER = "slash_tokenizer"; public static final List ALPHA_ONLY_PATTERNS = ImmutableList.of("([a-z0-9]{2,})"); + public static final String NUM_LENGTH_3_PATTERN = "(^[0-9]{1,3}$)"; public static final List URN_STOP_WORDS = ImmutableList.of("urn", "li"); public final Map settings; @@ -136,7 +139,7 @@ private static Map buildFilters() throws IOException { // Filter to split string into words filters.put(CUSTOM_DELIMITER, ImmutableMap.builder() .put(TYPE, WORD_DELIMITER) - .put(SPLIT_ON_NUMERICS, false) + .put(SPLIT_ON_NUMERICS, true) .put(PRESERVE_ORIGINAL, true) .put(TYPE_TABLE, ImmutableList.of( COLON_SUBWORD_DELIMITER @@ -145,7 +148,7 @@ private static Map buildFilters() throws IOException { filters.put(CUSTOM_DELIMITER_GRAPH, ImmutableMap.builder() .put(TYPE, WORD_DELIMITER_GRAPH) - .put(SPLIT_ON_NUMERICS, false) + .put(SPLIT_ON_NUMERICS, true) .put(PRESERVE_ORIGINAL, true) .put(TYPE_TABLE, ImmutableList.of( COLON_SUBWORD_DELIMITER @@ -187,6 +190,12 @@ private static Map buildFilters() throws IOException { .put(PATTERNS, ALPHA_ONLY_PATTERNS) .build()); + filters.put(REPLACE_NUM_LENGTH_3, ImmutableMap.builder() + .put(TYPE, "pattern_replace") + .put(PATTERN, NUM_LENGTH_3_PATTERN) + .put(REPLACEMENT, "") + .build()); + filters.put(SHINGLE_2_3, ImmutableMap.builder() .put(TYPE, "shingle") .put(MIN_SHINGLE_SIZE, "2") diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java index d75597e306843f..094ebb74fa48cb 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java @@ -72,18 +72,18 @@ private SearchResult executeAndExtract(@Nonnull EntitySpec entitySpec, @Nonnull * @param sortCriterion {@link SortCriterion} to be applied to search results * @param from index to start the 
search from * @param size the number of search hits to return - * @param structured Structured or full text search modes + * @param fulltext Structured or full text search modes * @return a {@link com.linkedin.metadata.dao.SearchResult} that contains a list of matched documents and related search result metadata */ @Nonnull public SearchResult search(@Nonnull String entityName, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, int from, int size, boolean structured) { + @Nullable SortCriterion sortCriterion, int from, int size, boolean fulltext) { final String finalInput = input.isEmpty() ? "*" : input; Timer.Context searchRequestTimer = MetricUtils.timer(this.getClass(), "searchRequest").time(); EntitySpec entitySpec = entityRegistry.getEntitySpec(entityName); // Step 1: construct the query final SearchRequest searchRequest = SearchRequestHandler.getBuilder(entitySpec) - .getSearchRequest(finalInput, postFilters, sortCriterion, from, size, structured); + .getSearchRequest(finalInput, postFilters, sortCriterion, from, size, fulltext); searchRequest.indices(indexConvention.getIndexName(entitySpec)); searchRequestTimer.stop(); // Step 2: execute the query and extract results, validated against document model as well diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index 9baa46561d92b3..1989994d70ee88 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -37,12 +37,12 @@ public class SearchQueryBuilder { private SearchQueryBuilder() { } - public static QueryBuilder buildQuery(@Nonnull EntitySpec entitySpec, @Nonnull String query, boolean structured) { + public static 
QueryBuilder buildQuery(@Nonnull EntitySpec entitySpec, @Nonnull String query, boolean fulltext) { final QueryBuilder queryBuilder; - if (structured) { - queryBuilder = buildInternalQuery(entitySpec, query, false, true); - } else { + if (fulltext) { queryBuilder = buildInternalQuery(entitySpec, query, true, false); + } else { + queryBuilder = buildInternalQuery(entitySpec, query, false, true); } return QueryBuilders.functionScoreQuery(queryBuilder, buildScoreFunctions(entitySpec)) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index 64a5b37741a8e7..fe2522882b3a3e 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -154,13 +154,13 @@ public static BoolQueryBuilder getFilterQuery(@Nullable Filter filter) { * @param filter the search filter * @param from index to start the search from * @param size the number of search hits to return - * @param structured Structured or full text search modes + * @param fulltext Structured or full text search modes * @return a valid search request */ @Nonnull @WithSpan public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter filter, - @Nullable SortCriterion sortCriterion, int from, int size, boolean structured) { + @Nullable SortCriterion sortCriterion, int from, int size, boolean fulltext) { SearchRequest searchRequest = new SearchRequest(); SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); @@ -170,7 +170,7 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi BoolQueryBuilder filterQuery = getFilterQuery(filter); searchSourceBuilder.query(QueryBuilders.boolQuery() - .must(getQuery(input, 
structured)) + .must(getQuery(input, fulltext)) .must(filterQuery)); getAggregations().forEach(searchSourceBuilder::aggregation); searchSourceBuilder.highlighter(getHighlights()); @@ -228,8 +228,8 @@ public static SearchRequest getAggregationRequest(@Nonnull String field, @Nullab return searchRequest; } - private QueryBuilder getQuery(@Nonnull String query, boolean structured) { - return SearchQueryBuilder.buildQuery(_entitySpec, query, structured); + private QueryBuilder getQuery(@Nonnull String query, boolean fulltext) { + return SearchQueryBuilder.buildQuery(_entitySpec, query, fulltext); } private List getAggregations() { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java index c7433600161c94..0281275d700018 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java @@ -11,6 +11,7 @@ import com.linkedin.datahub.graphql.resolvers.ResolverUtils; import com.linkedin.datahub.graphql.types.SearchableEntityType; import com.linkedin.metadata.graph.LineageDirection; +import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.search.LineageSearchResult; import com.linkedin.metadata.search.LineageSearchService; import com.linkedin.metadata.search.SearchResult; @@ -68,7 +69,12 @@ private ESTestUtils() { public static SearchResult search(SearchService searchService, String query) { return searchService.searchAcrossEntities(SEARCHABLE_ENTITIES, query, null, null, 0, - 100, null); + 100, new SearchFlags().setFulltext(true)); + } + + public static SearchResult searchStructured(SearchService searchService, String query) { + return searchService.searchAcrossEntities(SEARCHABLE_ENTITIES, query, null, null, 0, + 100, new SearchFlags().setFulltext(false)); } public static LineageSearchResult lineage(LineageSearchService lineageSearchService, Urn root, int hops) { diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java index 56f249c247bbf6..647b16ad353efb 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java @@ -10,6 +10,7 @@ import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; +import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.ConjunctiveCriterion; import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray; @@ -122,9 +123,11 @@ private void clearCache() { @Test public void testSearchService() throws Exception { SearchResult searchResult = - _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", null, null, 0, 10, null); + _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", null, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); clearCache(); @@ -137,7 +140,8 @@ public void testSearchService() throws Exception { _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, + null, 0, 10, new SearchFlags().setFulltext(true)); 
assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn); clearCache(); @@ -151,7 +155,8 @@ public void testSearchService() throws Exception { _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test2", null, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "'test2'", null, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn2); clearCache(); @@ -159,7 +164,8 @@ public void testSearchService() throws Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test2", null, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "'test2'", null, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); } @@ -187,7 +193,8 @@ public void testAdvancedSearchOr() throws Exception { SearchResult searchResult = - _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, null, 0, 10, null); + _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); clearCache(); @@ -224,7 +231,8 @@ public void testAdvancedSearchOr() throws Exception { syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, null); + searchResult = 
_searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 2); assertEquals(searchResult.getEntities().get(0).getEntity(), urn); assertEquals(searchResult.getEntities().get(1).getEntity(), urn2); @@ -253,7 +261,8 @@ public void testAdvancedSearchSoftDelete() throws Exception { SearchResult searchResult = - _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, null, 0, 10, null); + _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); clearCache(); @@ -293,7 +302,8 @@ public void testAdvancedSearchSoftDelete() throws Exception { syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn); clearCache(); @@ -316,7 +326,8 @@ public void testAdvancedSearchNegated() throws Exception { SearchResult searchResult = - _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, null, 0, 10, null); + _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); clearCache(); @@ -356,7 +367,8 @@ public void testAdvancedSearchNegated() throws Exception { syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, null); + 
searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn3); clearCache(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java index 28ef4d008a809d..25e2cf07c7e37d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java @@ -176,7 +176,7 @@ public void testElasticSearchServiceFulltext() throws Exception { _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _elasticSearchService.fullTextSearch(ENTITY_NAME, "test2", null, null, 0, 10); + searchResult = _elasticSearchService.fullTextSearch(ENTITY_NAME, "'test2'", null, null, 0, 10); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn2); @@ -187,7 +187,7 @@ public void testElasticSearchServiceFulltext() throws Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _elasticSearchService.fullTextSearch(ENTITY_NAME, "test2", null, null, 0, 10); + searchResult = _elasticSearchService.fullTextSearch(ENTITY_NAME, "'test2'", null, null, 0, 10); assertEquals(searchResult.getNumEntities().intValue(), 0); assertEquals(_elasticSearchService.docCount(ENTITY_NAME), 0); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java 
b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java index 92d281334e39b2..d3748658f3d8e4 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java @@ -30,6 +30,7 @@ import static com.linkedin.metadata.ESTestUtils.autocomplete; import static com.linkedin.metadata.ESTestUtils.search; +import static com.linkedin.metadata.ESTestUtils.searchStructured; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import static org.testng.Assert.assertNotNull; @@ -283,7 +284,7 @@ public void testTokenizationWithNumber() throws IOException { assertEquals(tokens, List.of( "harshal-playground-306419", "harshal", "playground", "306419", "test_schema", "test", "schema", - "austin311_deriv", "austin311", "deriv"), + "austin311_deriv", "austin311", "deriv", "austin", "311"), String.format("Unexpected tokens. Found %s", tokens)); request = AnalyzeRequest.withIndexAnalyzer( @@ -295,7 +296,7 @@ public void testTokenizationWithNumber() throws IOException { assertEquals(tokens, List.of( "harshal-playground-306419", "harshal", "playground", "306419", "test_schema", "test", "schema", - "austin311_deriv", "austin311", "deriv"), + "austin311_deriv", "austin311", "deriv", "austin", "311"), String.format("Unexpected tokens. Found %s", tokens)); } @@ -327,6 +328,7 @@ public void testTokenizationDataPlatform() throws IOException { "urn:li:dataplatform:hive", "data", "dataplatform", "platform", "hive", "samplehivedataset-ac611929-c3ac-4b92-aafb-f4603ddb408a", "samplehivedataset", "ac611929", "c3ac", "4b92", "aafb", "f4603ddb408a", "sampl", + "ac", "611929", "92", "4603", "ddb", "408", "prod", "production"), String.format("Unexpected tokens. 
Found %s", tokens)); @@ -358,6 +360,38 @@ public void testChartAutoComplete() throws InterruptedException { }); } + @Test + public void testSmokeTestQueries() { + Map expectedMinimums = Map.of( + "sample", 3, + "covid", 1 + ); + + Map results = expectedMinimums.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, entry -> search(searchService, entry.getKey()))); + + results.forEach((key, value) -> { + Integer actualCount = value.getEntities().size(); + Integer expectedCount = expectedMinimums.get(key); + assertTrue(actualCount >= expectedCount, + String.format("Search term `%s` has %s fulltext results, expected %s results.", key, + actualCount, expectedCount)); + }); + } + + @Test + public void testMinNumberLengthLimit() throws IOException { + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_delimited", + "data2022.data22" + ); + List expected = List.of("data2022", "data", "2022", "data22", "22"); + List actual = getTokens(request).map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()); + assertEquals(actual, expected, + String.format("Expected: %s Actual: %s", expected, actual)); + } + private Stream getTokens(AnalyzeRequest request) throws IOException { return _searchClient.indices().analyze(request, RequestOptions.DEFAULT).getTokens().stream(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java index a9b26c4c56ca95..26cc264072901a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java @@ -18,7 +18,7 @@ public class SearchQueryBuilderTest { public void testQueryBuilderFulltext() { FunctionScoreQueryBuilder result = 
(FunctionScoreQueryBuilder) SearchQueryBuilder.buildQuery(TestEntitySpecBuilder.getSpec(), "testQuery", - false); + true); BoolQueryBuilder mainQuery = (BoolQueryBuilder) result.query(); List shouldQueries = mainQuery.should(); assertEquals(shouldQueries.size(), 2); @@ -55,7 +55,7 @@ public void testQueryBuilderFulltext() { public void testQueryBuilderStructured() { FunctionScoreQueryBuilder result = (FunctionScoreQueryBuilder) SearchQueryBuilder.buildQuery(TestEntitySpecBuilder.getSpec(), "testQuery", - true); + false); BoolQueryBuilder mainQuery = (BoolQueryBuilder) result.query(); List shouldQueries = mainQuery.should(); assertEquals(shouldQueries.size(), 1); diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl index 7ce19971d9f45e..9448dbf5f8aaef 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl @@ -17,5 +17,5 @@ record SearchFlags { /** * Structured or unstructured fulltext query */ - structured: boolean = true + fulltext:optional boolean } diff --git a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entities.restspec.json b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entities.restspec.json index 0e65e5152c07a6..534d0c29eeb44d 100644 --- a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entities.restspec.json +++ b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entities.restspec.json @@ -263,8 +263,10 @@ "name" : "count", "type" : "int" }, { - "name" : "structured", - "type" : "boolean" + "name" : "fulltext", + "type" : "boolean", + "default" : "true", + "optional" : true } ], "returns" : "com.linkedin.metadata.search.SearchResult" }, { @@ -290,6 +292,11 @@ }, { "name" : "count", "type" : "int" + }, { + "name" : "fulltext", + "type" : "boolean", + "default" : "true", + 
"optional" : true } ], "returns" : "com.linkedin.metadata.search.SearchResult" }, { @@ -326,6 +333,11 @@ }, { "name" : "count", "type" : "int" + }, { + "name" : "fulltext", + "type" : "boolean", + "default" : "true", + "optional" : true } ], "returns" : "com.linkedin.metadata.search.LineageSearchResult" }, { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 1c77f4a72966fd..0de11a0229fac5 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -6272,8 +6272,10 @@ "name" : "count", "type" : "int" }, { - "name" : "structured", - "type" : "boolean" + "name" : "fulltext", + "type" : "boolean", + "default" : "true", + "optional" : true } ], "returns" : "com.linkedin.metadata.search.SearchResult" }, { diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java index 7b81bc70d98ace..da58f077341b2d 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -134,7 +134,7 @@ public void batchUpdate(@Nonnull final Set entities, @Nonnull final Auth @Nonnull public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nullable Map requestFilters, int start, int count, @Nonnull Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException; /** @@ -164,7 +164,7 @@ public ListResult list(@Nonnull String entity, @Nullable Map req @Nonnull public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nullable Filter filter, 
SortCriterion sortCriterion, int start, int count, @Nonnull Authentication authentication, - @Nullable Boolean structured) throws RemoteInvocationException; + @Nullable Boolean fulltext) throws RemoteInvocationException; /** * Searches for entities matching to a given query and filters across multiple entity types diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index dd665502a451c3..cb3d43a9f66af2 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -330,7 +330,7 @@ public void batchUpdate(@Nonnull final Set entities, @Nonnull final Auth @Override public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nullable Map requestFilters, int start, int count, @Nonnull final Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException { final EntitiesDoSearchRequestBuilder requestBuilder = ENTITIES_REQUEST_BUILDERS.actionSearch() @@ -339,7 +339,7 @@ public SearchResult search(@Nonnull String entity, @Nonnull String input, .filterParam(newFilter(requestFilters)) .startParam(start) .countParam(count) - .structuredParam(structured); + .fulltextParam(fulltext); return sendClientRequest(requestBuilder, authentication).getEntity(); } @@ -380,7 +380,7 @@ public ListResult list(@Nonnull String entity, @Nullable Map req @Override public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nullable Filter filter, SortCriterion sortCriterion, int start, int count, @Nonnull final Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException { final EntitiesDoSearchRequestBuilder requestBuilder = 
ENTITIES_REQUEST_BUILDERS.actionSearch() @@ -388,7 +388,7 @@ public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nulla .inputParam(input) .startParam(start) .countParam(count) - .structuredParam(structured); + .fulltextParam(fulltext); if (filter != null) { requestBuilder.filterParam(filter); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java index 0ce05086b53a4e..6a449681923449 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java @@ -21,6 +21,7 @@ import com.linkedin.metadata.query.AutoCompleteResult; import com.linkedin.metadata.query.ListResult; import com.linkedin.metadata.query.ListUrnsResult; +import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.Filter; @@ -99,7 +100,7 @@ public class EntityResource extends CollectionResourceTaskTemplate batchIngest(@ActionParam(PARAM_ENTITIES) @Nonnull Entity[] ent public Task search(@ActionParam(PARAM_ENTITY) @Nonnull String entityName, @ActionParam(PARAM_INPUT) @Nonnull String input, @ActionParam(PARAM_FILTER) @Optional @Nullable Filter filter, @ActionParam(PARAM_SORT) @Optional @Nullable SortCriterion sortCriterion, @ActionParam(PARAM_START) int start, - @ActionParam(PARAM_COUNT) int count, @ActionParam(PARAM_STRUCTURED) Boolean structured) { + @ActionParam(PARAM_COUNT) int count, @Optional @Nullable @ActionParam(PARAM_FULLTEXT) Boolean fulltext) { log.info("GET SEARCH RESULTS for {} with query {}", entityName, input); // TODO - change it to use _searchService once we are confident on it's 
latency return RestliUtil.toTask( () -> { final SearchResult result; - if (structured) { - result = _entitySearchService.structuredSearch(entityName, input, filter, sortCriterion, start, count); - } else { + if (Boolean.TRUE.equals(fulltext)) { result = _entitySearchService.fullTextSearch(entityName, input, filter, sortCriterion, start, count); + } else { + result = _entitySearchService.structuredSearch(entityName, input, filter, sortCriterion, start, count); } return validateSearchResult(result, _entityService); }, @@ -296,7 +297,8 @@ public Task searchAcrossEntities(@ActionParam(PARAM_ENTITIES) @Opt List entityList = entities == null ? Collections.emptyList() : Arrays.asList(entities); log.info("GET SEARCH RESULTS ACROSS ENTITIES for {} with query {}", entityList, input); return RestliUtil.toTask(() -> validateSearchResult( - _searchService.searchAcrossEntities(entityList, input, filter, sortCriterion, start, count, null), + _searchService.searchAcrossEntities(entityList, input, filter, sortCriterion, start, count, + new SearchFlags().setFulltext(true)), _entityService), "searchAcrossEntities"); } diff --git a/smoke-test/smoke-dev.sh b/smoke-test/smoke-dev.sh deleted file mode 100755 index 9237065e948352..00000000000000 --- a/smoke-test/smoke-dev.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -set -euxo pipefail - -# Runs a basic e2e test. It is not meant to be fully comprehensive, -# but rather should catch obvious bugs before they make it into prod. -# -# Script assumptions: -# - The gradle build has already been run. -# - Python 3.6+ is installed and in the PATH. 
- -# Log the locally loaded images -# docker images | grep "datahub-" - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -cd "$DIR" - -python3 -m venv venv -source venv/bin/activate -pip install --upgrade pip wheel setuptools -pip install -r requirements.txt - -echo "DATAHUB_VERSION = ${DATAHUB_VERSION:=acryl-datahub 0.0.0.dev0}" -DATAHUB_TELEMETRY_ENABLED=false \ -DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ -datahub docker quickstart --build-locally --standalone_consumers --dump-logs-on-failure - -(cd ..; ./gradlew :smoke-test:yarnInstall) - -pytest -rP --durations=20 -vv --junit-xml=junit.smoke.xml $@ From 35dc594ed3c4c9fc1cef8f72415796115f51f28e Mon Sep 17 00:00:00 2001 From: David Leifker Date: Mon, 2 Jan 2023 07:21:06 -0600 Subject: [PATCH 12/12] fix non-fulltext path with stemming --- .../com/linkedin/metadata/search/SearchService.java | 1 - .../elasticsearch/indexbuilder/SettingsBuilder.java | 2 +- .../fixtures/SampleDataFixtureTests.java | 11 +++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java index 220cc6971a3b3a..91937a862e360f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java @@ -9,7 +9,6 @@ import com.linkedin.metadata.search.ranker.SearchRanker; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.function.Function; import java.util.stream.Collectors; import javax.annotation.Nonnull; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index 406438ecc1e9ec..815419b2db54a1 100644 --- 
a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -324,7 +324,7 @@ private static Map buildAnalyzers(String mainTokenizer) { // Analyzer for case-insensitive exact matching - Only used when building queries analyzers.put(KEYWORD_LOWERCASE_ANALYZER, ImmutableMap.builder() .put(TOKENIZER, KEYWORD_TOKENIZER) - .put(FILTER, ImmutableList.of("trim", LOWERCASE, ASCII_FOLDING)) + .put(FILTER, ImmutableList.of("trim", LOWERCASE, ASCII_FOLDING, SNOWBALL)) .build()); // Analyzer for getting urn components diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java index d3748658f3d8e4..604338393827df 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java @@ -377,6 +377,17 @@ public void testSmokeTestQueries() { String.format("Search term `%s` has %s fulltext results, expected %s results.", key, actualCount, expectedCount)); }); + + results = expectedMinimums.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, entry -> searchStructured(searchService, entry.getKey()))); + + results.forEach((key, value) -> { + Integer actualCount = value.getEntities().size(); + Integer expectedCount = expectedMinimums.get(key); + assertTrue(actualCount >= expectedCount, + String.format("Search term `%s` has %s structured results, expected %s results.", key, + actualCount, expectedCount)); + }); } @Test