diff --git a/.github/workflows/deploy_release.yml b/.github/workflows/deploy_release.yml
index 8e1d440f..53f24a28 100644
--- a/.github/workflows/deploy_release.yml
+++ b/.github/workflows/deploy_release.yml
@@ -25,32 +25,30 @@ jobs:
           key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
           restore-keys: ${{ runner.os }}-maven-
 
-      - name: download neo4j-contrib & graphframes & pulsar-spark-connector dependency
-        run: |
-          wget https://oss-cdn.nebula-graph.com.cn/jar-packages/neo4j-contrib.zip
-          wget https://oss-cdn.nebula-graph.com.cn/jar-packages/graphframes.zip
-          wget https://oss-cdn.nebula-graph.com.cn/jar-packages/streamnative.zip
-          unzip -o -d ~/.m2/repository/ neo4j-contrib.zip
-          unzip -o -d ~/.m2/repository/ graphframes.zip
-          rm -rf ~/.m2/repository/io/streamnative
-          unzip -o -d ~/.m2/repository/io/ streamnative.zip
-
       - name: Install nebula-graph
         run: |
           mkdir tmp
           pushd tmp
           git clone https://github.com/vesoft-inc/nebula-docker-compose.git
           pushd nebula-docker-compose/
-          cp ../../nebula-exchange/src/test/resources/docker-compose.yaml .
+          cp ../../exchange-common/src/test/resources/docker-compose.yaml .
           docker-compose up -d
           sleep 10
           popd
           popd
 
-      - name: Deploy release to Maven
-        uses: samuelmeuli/action-maven-publish@v1
+      - name: Build with Maven
+        run: |
+          mvn clean package -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2
+          mvn clean package -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4
+          mvn clean package -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0
+
+      - name: upload to release assets
+        uses: softprops/action-gh-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }}
-          gpg_passphrase: ${{ secrets.GPG_PASSPHRASE }}
-          nexus_username: ${{ secrets.OSSRH_USERNAME }}
-          nexus_password: ${{ secrets.OSSRH_TOKEN }}
+          files: |
+            nebula-exchange_spark_2.2/target/nebula-exchange_spark_2.2-*-SNAPSHOT.jar
+            nebula-exchange_spark_2.4/target/nebula-exchange_spark_2.4-*-SNAPSHOT.jar
+            nebula-exchange_spark_3.0/target/nebula-exchange_spark_3.0-*-SNAPSHOT.jar
diff --git a/.github/workflows/deploy_snapshot.yml b/.github/workflows/deploy_snapshot.yml
index df62ae1f..68cf47b0 100644
--- a/.github/workflows/deploy_snapshot.yml
+++ b/.github/workflows/deploy_snapshot.yml
@@ -27,32 +27,30 @@ jobs:
           key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
           restore-keys: ${{ runner.os }}-maven-
 
-      - name: download neo4j-contrib & graphframes & pulsar-spark-connector dependency
-        run: |
-          wget https://oss-cdn.nebula-graph.com.cn/jar-packages/neo4j-contrib.zip
-          wget https://oss-cdn.nebula-graph.com.cn/jar-packages/graphframes.zip
-          wget https://oss-cdn.nebula-graph.com.cn/jar-packages/streamnative.zip
-          unzip -o -d ~/.m2/repository/ neo4j-contrib.zip
-          unzip -o -d ~/.m2/repository/ graphframes.zip
-          rm -rf ~/.m2/repository/io/streamnative
-          unzip -o -d ~/.m2/repository/io/ streamnative.zip
-
       - name: Install nebula-graph
         run: |
           mkdir tmp
           pushd tmp
           git clone https://github.com/vesoft-inc/nebula-docker-compose.git
           pushd nebula-docker-compose/
-          cp ../../nebula-exchange/src/test/resources/docker-compose.yaml .
+          cp ../../exchange-common/src/test/resources/docker-compose.yaml .
           docker-compose up -d
           sleep 10
           popd
           popd
 
-      - name: Deploy SNAPSHOT to Sonatype
-        uses: samuelmeuli/action-maven-publish@v1
+      - name: Build with Maven
+        run: |
+          mvn clean package -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2
+          mvn clean package -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4
+          mvn clean package -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0
+
+      - name: upload to snapshot assets
+        uses: softprops/action-gh-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }}
-          gpg_passphrase: ${{ secrets.GPG_PASSPHRASE }}
-          nexus_username: ${{ secrets.OSSRH_USERNAME }}
-          nexus_password: ${{ secrets.OSSRH_TOKEN }}
+          files: |
+            nebula-exchange_spark_2.2/target/nebula-exchange_spark_2.2-*-SNAPSHOT.jar
+            nebula-exchange_spark_2.4/target/nebula-exchange_spark_2.4-*-SNAPSHOT.jar
+            nebula-exchange_spark_3.0/target/nebula-exchange_spark_3.0-*-SNAPSHOT.jar
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index c3d96094..affa485f 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -30,29 +30,22 @@ jobs:
           key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
           restore-keys: ${{ runner.os }}-maven-
 
-      - name: download neo4j-contrib & graphframes & pulsar-spark-connector dependency
-        run: |
-          wget https://oss-cdn.nebula-graph.com.cn/jar-packages/neo4j-contrib.zip
-          wget https://oss-cdn.nebula-graph.com.cn/jar-packages/graphframes.zip
-          wget https://oss-cdn.nebula-graph.com.cn/jar-packages/streamnative.zip
-          unzip -o -d ~/.m2/repository/ neo4j-contrib.zip
-          unzip -o -d ~/.m2/repository/ graphframes.zip
-          rm -rf ~/.m2/repository/io/streamnative
-          unzip -o -d ~/.m2/repository/io/ streamnative.zip
-
       - name: Install nebula-graph
         run: |
           mkdir tmp
           pushd tmp
           git clone https://github.com/vesoft-inc/nebula-docker-compose.git
           pushd nebula-docker-compose/
-          cp ../../nebula-exchange/src/test/resources/docker-compose.yaml .
+          cp ../../exchange-common/src/test/resources/docker-compose.yaml .
           docker-compose up -d
           sleep 10
           popd
           popd
 
       - name: Build with Maven
-        run: mvn -B package
+        run: |
+          mvn clean package -pl nebula-exchange_spark_2.2 -am -Pscala-2.11 -Pspark-2.2
+          mvn clean package -pl nebula-exchange_spark_2.4 -am -Pscala-2.11 -Pspark-2.4
+          mvn clean package -pl nebula-exchange_spark_3.0 -am -Pscala-2.12 -Pspark-3.0
 
       - uses: codecov/codecov-action@v2
diff --git a/exchange-common/pom.xml b/exchange-common/pom.xml
new file mode 100644
index 00000000..7c3207f5
--- /dev/null
+++ b/exchange-common/pom.xml
@@ -0,0 +1,252 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>exchange</artifactId>
+        <groupId>com.vesoft</groupId>
+        <version>2.5-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>exchange-common</artifactId>
+
+    <properties>
+        <maven.compiler.source>1.8</maven.compiler.source>
+        <maven.compiler.target>1.8</maven.compiler.target>
+        <nebula.version>2.0.0-SNAPSHOT</nebula.version>
+        <scalatest.version>3.2.0</scalatest.version>
+        <scala-logging.version>3.9.2</scala-logging.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.vesoft</groupId>
+            <artifactId>client</artifactId>
+            <version>${nebula.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.locationtech.jts</groupId>
+            <artifactId>jts-core</artifactId>
+            <version>1.16.1</version>
+        </dependency>
+
+        <!-- scala dependency -->
+        <dependency>
+            <groupId>org.scalatest</groupId>
+            <artifactId>scalatest_${scala.binary.version}</artifactId>
+            <version>${scalatest.version}</version>
+            <scope>test</scope>
+        </dependency>
+
+        <!-- spark dependency -->
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_${scala.binary.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_${scala.binary.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <!-- deploy plugin -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-deploy-plugin</artifactId>
+                <version>2.8.2</version>
+                <executions>
+                    <execution>
+                        <id>default-deploy</id>
+                        <phase>deploy</phase>
+                        <configuration>
+                            <skip>true</skip>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <!-- skip nexus staging deploy -->
+            <plugin>
+                <groupId>org.sonatype.plugins</groupId>
+                <artifactId>nexus-staging-maven-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <id>default-deploy</id>
+                        <phase>deploy</phase>
+                        <goals>
+                            <goal>deploy</goal>
+                        </goals>
+                        <configuration>
+                            <serverId>ossrh</serverId>
+                            <skipNexusStagingDeployMojo>true</skipNexusStagingDeployMojo>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+            <!-- Source plugin -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-source-plugin</artifactId>
+                <version>3.2.0</version>
+                <executions>
+                    <execution>
+                        <id>attach-sources</id>
+                        <goals>
+                            <goal>jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <!-- Javadoc plugin -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-javadoc-plugin</artifactId>
+                <version>3.2.0</version>
+                <executions>
+                    <execution>
+                        <id>attach-javadocs</id>
<phase>package</phase> + <goals> + <goal>jar</goal> + </goals> + <configuration> + <encoding>UTF-8</encoding> + <charset>UTF-8</charset> + <additionalOptions> + <additionalparam>-source 8</additionalparam> + <additionalOption>-Xdoclint:none</additionalOption> + </additionalOptions> + </configuration> + </execution> + </executions> + </plugin> + <!-- test plugin --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.12.4</version> + <configuration> + <includes> + <include>**/*Test.*</include> + <include>**/*Suite.*</include> + </includes> + </configuration> + </plugin> + <plugin> + <groupId>org.scalatest</groupId> + <artifactId>scalatest-maven-plugin</artifactId> + <version>2.0.0</version> + <executions> + <execution> + <id>test</id> + <goals> + <goal>test</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.scala-tools</groupId> + <artifactId>maven-scala-plugin</artifactId> + <version>2.15.2</version> + <configuration> + <scalaVersion>${scala.version}</scalaVersion> + <args> + <arg>-target:jvm-1.8</arg> + </args> + <jvmArgs> + <jvmArg>-Xss4096K</jvmArg> + </jvmArgs> + </configuration> + <executions> + <execution> + <id>scala-compile</id> + <goals> + <goal>compile</goal> + </goals> + <configuration> + <excludes> + <exclude>com/vesoft/tools/**</exclude> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </configuration> + </execution> + <execution> + <id>scala-test-compile</id> + <goals> + <goal>testCompile</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>net.alchim31.maven</groupId> + <artifactId>scala-maven-plugin</artifactId> + <version>4.4.0</version> + <executions> + <execution> + <id>Scaladoc</id> + <goals> + <goal>doc</goal> + </goals> + <phase>prepare-package</phase> + <configuration> + <args> + <arg>-nobootcp</arg> + <arg>-no-link-warnings</arg> + </args> + </configuration> + </execution> + <execution> + <id>attach-javadocs</id> + <goals> + <goal>doc-jar</goal> + </goals> + <configuration> + <args> + <arg>-nobootcp</arg> + <arg>-no-link-warnings</arg> + </args> + </configuration> + </execution> + <execution> + <id>scala-compile-first</id> + <goals> + <goal>compile</goal> + </goals> + </execution> + </executions> + </plugin> + <!-- jacoco plugin --> + <plugin> + <groupId>org.jacoco</groupId> + <artifactId>jacoco-maven-plugin</artifactId> + <version>0.8.7</version> + <executions> + <execution> + <goals> + <goal>prepare-agent</goal> + </goals> + </execution> + <execution> + <id>report</id> + <phase>test</phase> + <goals> + <goal>report</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/CheckPointHandler.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/CheckPointHandler.scala similarity index 80% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/CheckPointHandler.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/CheckPointHandler.scala index 8bd2c367..e4b31272 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/CheckPointHandler.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/CheckPointHandler.scala @@ -1,13 +1,12 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. 
* * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange +package com.vesoft.exchange.common -import com.vesoft.nebula.exchange.config.{SourceCategory} -import com.vesoft.nebula.exchange.utils.HDFSUtils -import com.vesoft.nebula.exchange.config.SchemaConfigEntry +import com.vesoft.exchange.common.config.{SchemaConfigEntry, SourceCategory} +import com.vesoft.exchange.common.utils.HDFSUtils import org.apache.spark.TaskContext /** diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/ErrorHandler.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/ErrorHandler.scala similarity index 88% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/ErrorHandler.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/ErrorHandler.scala index 96a76b8e..e188e8d5 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/ErrorHandler.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/ErrorHandler.scala @@ -1,9 +1,9 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange +package com.vesoft.exchange.common import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} @@ -46,9 +46,9 @@ object ErrorHandler { */ def save(buffer: ArrayBuffer[String], path: String): Unit = { LOG.info(s"create reload path $path") - val fileSystem = FileSystem.get(new Configuration()) - val targetPath = new Path(path) - val errors = if (fileSystem.exists(targetPath)) { + val fileSystem = FileSystem.get(new Configuration()) + val targetPath = new Path(path) + val errors = if (fileSystem.exists(targetPath)) { // For kafka, the error ngql need to append to a same file instead of overwrite fileSystem.append(targetPath) } else { diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/GraphProvider.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/GraphProvider.scala similarity index 94% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/GraphProvider.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/GraphProvider.scala index 50482d8b..995f84a9 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/GraphProvider.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/GraphProvider.scala @@ -3,9 +3,10 @@ * This source code is licensed under Apache 2.0 License. 
*/ -package com.vesoft.nebula.exchange +package com.vesoft.exchange.common import com.google.common.net.HostAndPort +import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, UserConfigEntry} import com.vesoft.nebula.client.graph.NebulaPoolConfig import com.vesoft.nebula.client.graph.data.{ CASignedSSLParam, @@ -15,7 +16,6 @@ import com.vesoft.nebula.client.graph.data.{ SelfSignedSSLParam } import com.vesoft.nebula.client.graph.net.{NebulaPool, Session} -import com.vesoft.nebula.exchange.config.{SslConfigEntry, SslType, UserConfigEntry} import org.apache.log4j.Logger import scala.collection.JavaConverters._ @@ -39,7 +39,7 @@ class GraphProvider(addresses: List[HostAndPort], timeout: Int, sslConfigEntry: nebulaPoolConfig.setTimeout(timeout) - // config graph ssl + // com.vesoft.exchange.common.config graph ssl nebulaPoolConfig.setEnableSsl(sslConfigEntry.enableGraph) if (sslConfigEntry.enableGraph) { var sslParam: SSLParam = null diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/MetaProvider.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/MetaProvider.scala similarity index 96% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/MetaProvider.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/MetaProvider.scala index 07f182a0..a46a3888 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/MetaProvider.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/MetaProvider.scala @@ -3,9 +3,10 @@ * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange +package com.vesoft.exchange.common import com.google.common.net.HostAndPort +import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, Type} import com.vesoft.nebula.PropertyType import com.vesoft.nebula.client.graph.data.{ CASignedSSLParam, @@ -14,7 +15,6 @@ import com.vesoft.nebula.client.graph.data.{ SelfSignedSSLParam } import com.vesoft.nebula.client.meta.MetaClient -import com.vesoft.nebula.exchange.config.{SslConfigEntry, SslType, Type} import com.vesoft.nebula.meta.{EdgeItem, TagItem} import org.apache.log4j.Logger @@ -40,7 +40,7 @@ class MetaProvider(addresses: List[HostAndPort], private var metaClient: MetaClient = null var sslParam: SSLParam = null - // config meta ssl + // com.vesoft.exchange.common.config meta ssl if (sslConfigEntry.enableMeta) { if (sslConfigEntry.signType == SslType.CA) { val ca = sslConfigEntry.caSignParam diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/package.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/Package.scala similarity index 86% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/package.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/Package.scala index e4f9ec39..278bf60a 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/package.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/Package.scala @@ -1,17 +1,17 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. 
*/ -package com.vesoft.nebula +package com.vesoft.exchange import com.google.common.base.Optional import com.google.common.util.concurrent.ListenableFuture -import com.vesoft.nebula.exchange.utils.NebulaUtils +import com.vesoft.exchange.common.utils.NebulaUtils import scala.collection.mutable.ListBuffer -package object exchange { +package object common { type GraphSpaceID = Int type PartitionID = Int @@ -92,3 +92,9 @@ package object exchange { case class Offset(start: Long, size: Long) } + +final case class Argument(config: String = "application.conf", + hive: Boolean = false, + directly: Boolean = false, + dry: Boolean = false, + reload: String = "") diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/Configs.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/config/Configs.scala similarity index 95% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/Configs.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/config/Configs.scala index 41a041d6..60cdc754 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/Configs.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/config/Configs.scala @@ -3,23 +3,21 @@ * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange.config +package com.vesoft.exchange.common.config import java.io.File import java.nio.file.Files import com.google.common.net.HostAndPort -import com.vesoft.nebula.exchange.KeyPolicy import com.typesafe.config.{Config, ConfigFactory} -import com.vesoft.nebula.exchange.Argument -import com.vesoft.nebula.exchange.config.SslType.SslType -import com.vesoft.nebula.exchange.utils.NebulaUtils +import com.vesoft.exchange.Argument +import com.vesoft.exchange.common.KeyPolicy import org.apache.log4j.Logger -import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ListBuffer -import scala.util.control.Breaks._ +import scala.collection.JavaConverters._ +import scala.util.control.Breaks.break object Type extends Enumeration { type Type = Value @@ -365,7 +363,7 @@ object Configs { fields } - // You can specified the vertex field name via the config item `vertex` + // You can specified the vertex field name via the com.vesoft.exchange.common.config item `vertex` // If you want to qualified the key policy, you can wrap them into a block. val vertexField = if (tagConfig.hasPath("vertex.field")) { tagConfig.getString("vertex.field") @@ -587,7 +585,7 @@ object Configs { } /** - * Use to generate data source config according to category of source. + * Use to generate data source com.vesoft.exchange.common.config according to category of source. * * @param category * @param config @@ -740,12 +738,15 @@ object Configs { } else { "1" } - ClickHouseConfigEntry(SourceCategory.CLICKHOUSE, - config.getString("url"), - config.getString("user"), - config.getString("password"), - partition, - config.getString("sentence")) + ClickHouseConfigEntry( + SourceCategory.CLICKHOUSE, + config.getString("url"), + config.getString("user"), + config.getString("password"), + partition, + config.getString("table"), + config.getString("sentence") + ) } case _ => throw new IllegalArgumentException("Unsupported data source") @@ -778,10 +779,10 @@ object Configs { } /** - * Get the config list by the path. + * Get the com.vesoft.exchange.common.config list by the path. * - * @param config The config. - * @param path The path of the config. 
+ * @param config The com.vesoft.exchange.common.config. + * @param path The path of the com.vesoft.exchange.common.config. * @return */ private[this] def getConfigsOrNone(config: Config, @@ -794,7 +795,7 @@ object Configs { } /** - * Get the config by the path. + * Get the com.vesoft.exchange.common.config by the path. * * @param config * @param path @@ -809,10 +810,10 @@ object Configs { } /** - * Get the value from config by the path. If the path not exist, return the default value. + * Get the value from com.vesoft.exchange.common.config by the path. If the path not exist, return the default value. * - * @param config The config. - * @param path The path of the config. + * @param config The com.vesoft.exchange.common.config. + * @param path The path of the com.vesoft.exchange.common.config. * @param defaultValue The default value for the path. * @return */ @@ -833,7 +834,7 @@ object Configs { } /** - * Get the value from config by the path which is optional. + * Get the value from com.vesoft.exchange.common.config by the path which is optional. * If the path not exist, return the default value. * * @param config @@ -865,7 +866,7 @@ object Configs { .required() .valueName("fileName") .action((x, c) => c.copy(config = x)) - .text("config fileName") + .text("com.vesoft.exchange.common.config fileName") opt[Unit]('h', "hive") .action((_, c) => c.copy(hive = true)) diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SchemaConfigs.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/config/SchemaConfigs.scala similarity index 98% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SchemaConfigs.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/config/SchemaConfigs.scala index e92ba070..7a04f24e 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SchemaConfigs.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/config/SchemaConfigs.scala @@ -3,9 +3,9 @@ * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange.config +package com.vesoft.exchange.common.config -import com.vesoft.nebula.exchange.KeyPolicy +import com.vesoft.exchange.common.KeyPolicy /** * SchemaConfigEntry is tag/edge super class use to save some basic parameter for importer. diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SinkConfigs.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/config/SinkConfigs.scala similarity index 96% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SinkConfigs.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/config/SinkConfigs.scala index e05cf693..92b63a1d 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SinkConfigs.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/config/SinkConfigs.scala @@ -3,7 +3,7 @@ * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange.config +package com.vesoft.exchange.common.config /** * SinkCategory is used to expression the writer's type. 
diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SourceConfigs.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/config/SourceConfigs.scala similarity index 96% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SourceConfigs.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/config/SourceConfigs.scala index 2a21c1ed..7fac5fec 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SourceConfigs.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/config/SourceConfigs.scala @@ -3,9 +3,9 @@ * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange.config +package com.vesoft.exchange.common.config -import com.vesoft.nebula.exchange.utils.NebulaUtils +import com.vesoft.exchange.common.utils.NebulaUtils /** * Category use to explain the data source which the Spark application could reading. @@ -158,7 +158,7 @@ case class MySQLSourceConfigEntry(override val category: SourceCategory.Value, } /** - * TODO: Support more config item about Kafka Consumer + * TODO: Support more com.vesoft.exchange.common.config item about Kafka Consumer * * @param server * @param topic @@ -170,7 +170,7 @@ case class KafkaSourceConfigEntry(override val category: SourceCategory.Value, server: String, topic: String, startingOffsets: String, - maxOffsetsPerTrigger: Option[Long]=None) + maxOffsetsPerTrigger: Option[Long] = None) extends StreamingDataSourceConfigEntry { require(server.trim.nonEmpty && topic.trim.nonEmpty) @@ -257,9 +257,10 @@ case class ClickHouseConfigEntry(override val category: SourceCategory.Value, user: String, passwd: String, numPartition: String, + table: String, override val sentence: String) extends ServerDataSourceConfigEntry { override def toString: String = { - s"ClickHouse source {url:$url, user:$user, passwd:$passwd, numPartition:$numPartition, sentence:$sentence}" + s"ClickHouse source {url:$url, user:$user, passwd:$passwd, numPartition:$numPartition, table:$table, sentence:$sentence}" } } diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/Processor.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/processor/Processor.scala similarity index 95% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/Processor.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/processor/Processor.scala index a727d2c6..9d15820e 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/Processor.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/processor/Processor.scala @@ -3,10 +3,10 @@ * This source code is licensed under Apache 2.0 License. 
*/ -package com.vesoft.nebula.exchange.processor +package com.vesoft.exchange.common.processor -import com.vesoft.nebula.exchange.Vertex -import com.vesoft.nebula.exchange.config.{EdgeConfigEntry, TagConfigEntry} +import com.vesoft.exchange.common.utils.{HDFSUtils, NebulaUtils} +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE import com.vesoft.nebula.{ Coordinate, Date, @@ -20,17 +20,14 @@ import com.vesoft.nebula.{ Time, Value } -import com.vesoft.nebula.exchange.utils.NebulaUtils.DEFAULT_EMPTY_VALUE -import com.vesoft.nebula.exchange.utils.{HDFSUtils, NebulaUtils} import org.apache.log4j.Logger import org.apache.spark.sql.Row -import org.apache.spark.sql.types.{IntegerType, LongType, StringType} import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer /** - * processor is a converter. + * com.vesoft.exchange.common.processor is a converter. * It is responsible for converting the dataframe row data into Nebula Graph's vertex or edge, * and submit data to writer. */ diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/ReloadProcessor.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/processor/ReloadProcessor.scala similarity index 85% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/ReloadProcessor.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/processor/ReloadProcessor.scala index fdacb0c6..b954453c 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/ReloadProcessor.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/processor/ReloadProcessor.scala @@ -3,11 +3,12 @@ * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange.processor +package com.vesoft.exchange.common.processor -import com.vesoft.nebula.exchange.{ErrorHandler, GraphProvider} -import com.vesoft.nebula.exchange.config.Configs -import com.vesoft.nebula.exchange.writer.NebulaGraphClientWriter +import com.vesoft.exchange.common.{ErrorHandler, GraphProvider} +import com.vesoft.exchange.common.GraphProvider +import com.vesoft.exchange.common.config.Configs +import com.vesoft.exchange.common.writer.NebulaGraphClientWriter import org.apache.log4j.Logger import org.apache.spark.TaskContext import org.apache.spark.sql.{DataFrame, Row} @@ -24,7 +25,7 @@ class ReloadProcessor(data: DataFrame, private[this] lazy val LOG = Logger.getLogger(this.getClass) override def process(): Unit = { - data.foreachPartition(processEachPartition(_)) + data.foreachPartition((rows: Iterator[Row]) => processEachPartition(rows)) } private def processEachPartition(iterator: Iterator[Row]): Unit = { diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/HDFSUtils.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/utils/HDFSUtils.scala similarity index 95% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/HDFSUtils.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/utils/HDFSUtils.scala index fea20a0b..4d7000ac 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/HDFSUtils.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/utils/HDFSUtils.scala @@ -1,15 +1,17 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. 
*/ -package com.vesoft.nebula.exchange.utils +package com.vesoft.exchange.common.utils import java.io.File import java.nio.charset.Charset + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.Logger + import scala.io.Source object HDFSUtils { diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/NebulaUtils.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/utils/NebulaUtils.scala similarity index 92% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/NebulaUtils.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/utils/NebulaUtils.scala index 0b4f7579..9aea2295 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/NebulaUtils.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/utils/NebulaUtils.scala @@ -1,15 +1,16 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange.utils +package com.vesoft.exchange.common.utils import java.nio.charset.Charset import com.google.common.primitives.UnsignedLong -import com.vesoft.nebula.exchange.{MetaProvider, VidType} -import com.vesoft.nebula.exchange.config.{SchemaConfigEntry, Type} +import com.vesoft.exchange.common.MetaProvider +import com.vesoft.exchange.common.VidType +import com.vesoft.exchange.common.config.{SchemaConfigEntry, Type} import org.apache.commons.codec.digest.MurmurHash2 import org.apache.log4j.Logger diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/FileBaseWriter.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/writer/FileBaseWriter.scala similarity index 95% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/FileBaseWriter.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/writer/FileBaseWriter.scala index b9a19c55..2f42f077 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/FileBaseWriter.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/writer/FileBaseWriter.scala @@ -3,12 +3,13 @@ * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange.writer +package com.vesoft.exchange.common.writer import java.nio.{ByteBuffer, ByteOrder} import java.nio.file.{Files, Paths} -import com.vesoft.nebula.exchange.config.FileBaseSinkConfigEntry -import com.vesoft.nebula.exchange.utils.HDFSUtils + +import com.vesoft.exchange.common.config.FileBaseSinkConfigEntry +import com.vesoft.exchange.common.utils.HDFSUtils import org.apache.spark.TaskContext import org.apache.spark.sql.Row import org.apache.spark.util.LongAccumulator diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/ServerBaseWriter.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/writer/ServerBaseWriter.scala similarity index 91% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/ServerBaseWriter.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/writer/ServerBaseWriter.scala index 348420da..2638c37a 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/ServerBaseWriter.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/writer/ServerBaseWriter.scala @@ -3,33 +3,21 @@ * This source code is licensed under Apache 2.0 License. 
*/ -package com.vesoft.nebula.exchange.writer +package com.vesoft.exchange.common.writer -import java.util.concurrent.{CountDownLatch, TimeUnit} +import java.util.concurrent.TimeUnit -import com.google.common.base.Optional -import com.google.common.util.concurrent.{FutureCallback, RateLimiter} -import com.vesoft.nebula.ErrorCode -import com.vesoft.nebula.exchange.config.{ - ConnectionConfigEntry, +import com.google.common.util.concurrent.RateLimiter +import com.vesoft.exchange.common.GraphProvider +import com.vesoft.exchange.common.{Edges, KeyPolicy, Vertices} +import com.vesoft.exchange.common.config.{ DataBaseConfigEntry, RateConfigEntry, SchemaConfigEntry, Type, UserConfigEntry } -import com.vesoft.nebula.exchange.utils.HDFSUtils -import com.vesoft.nebula.exchange.{ - Edges, - GraphProvider, - KeyPolicy, - TooManyErrorsException, - Vertices -} import org.apache.log4j.Logger -import org.apache.spark.util.LongAccumulator - -import scala.collection.JavaConverters._ abstract class ServerBaseWriter extends Writer { private[this] val BATCH_INSERT_TEMPLATE = "INSERT %s `%s`(%s) VALUES %s" diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/Writer.scala b/exchange-common/src/main/scala/com/vesoft/exchange/common/writer/Writer.scala similarity index 83% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/Writer.scala rename to exchange-common/src/main/scala/com/vesoft/exchange/common/writer/Writer.scala index 5fb2ea84..e8747da5 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/writer/Writer.scala +++ b/exchange-common/src/main/scala/com/vesoft/exchange/common/writer/Writer.scala @@ -3,7 +3,7 @@ * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange.writer +package com.vesoft.exchange.common.writer /** * diff --git a/nebula-exchange/src/test/resources/application.conf b/exchange-common/src/test/resources/application.conf similarity index 96% rename from nebula-exchange/src/test/resources/application.conf rename to exchange-common/src/test/resources/application.conf index 447d267a..3ece57a0 100644 --- a/nebula-exchange/src/test/resources/application.conf +++ b/exchange-common/src/test/resources/application.conf @@ -1,5 +1,5 @@ { - # Spark relation config + # Spark relation com.vesoft.exchange.common.config spark: { app: { name: Nebula Exchange 2.0 @@ -22,7 +22,7 @@ } # if the hive is hive-on-spark with derby mode, you can ignore this hive configure - # get the config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml + # get the com.vesoft.exchange.common.config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml hive: { warehouse: "hdfs://NAMENODE_IP:9000/apps/svr/hive-xxx/warehouse/" @@ -32,7 +32,7 @@ connectionPassword: "password" } - # Nebula Graph relation config + # Nebula Graph relation com.vesoft.exchange.common.config nebula: { address:{ graph:["127.0.0.1:9669", "127.0.0.1:9670", "127.0.0.1:9671"] @@ -78,7 +78,7 @@ } # Processing tags - # There are tag config examples for different dataSources. + # There are tag com.vesoft.exchange.common.config examples for different dataSources. tags: [ # HDFS parquet @@ -269,7 +269,7 @@ ] # Processing edges - # There are edge config examples for different dataSources. + # There are edge com.vesoft.exchange.common.config examples for different dataSources. edges: [ # HDFS parquet # Import mode is client, just change type.sink to sst if you want to use sst import mode. 
diff --git a/nebula-exchange/src/test/resources/docker-compose.yaml b/exchange-common/src/test/resources/docker-compose.yaml similarity index 100% rename from nebula-exchange/src/test/resources/docker-compose.yaml rename to exchange-common/src/test/resources/docker-compose.yaml diff --git a/nebula-exchange/src/test/resources/edge.csv b/exchange-common/src/test/resources/edge.csv similarity index 100% rename from nebula-exchange/src/test/resources/edge.csv rename to exchange-common/src/test/resources/edge.csv diff --git a/nebula-exchange/src/test/resources/process_application.conf b/exchange-common/src/test/resources/process_application.conf similarity index 87% rename from nebula-exchange/src/test/resources/process_application.conf rename to exchange-common/src/test/resources/process_application.conf index 3846e545..8160ab55 100644 --- a/nebula-exchange/src/test/resources/process_application.conf +++ b/exchange-common/src/test/resources/process_application.conf @@ -1,5 +1,5 @@ { - # Spark relation config + # Spark relation com.vesoft.exchange.common.config spark: { app: { name: Nebula Exchange 2.0 @@ -22,7 +22,7 @@ } # if the hive is hive-on-spark with derby mode, you can ignore this hive configure - # get the config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml + # get the com.vesoft.exchange.common.config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml hive: { warehouse: "hdfs://NAMENODE_IP:9000/apps/svr/hive-xxx/warehouse/" @@ -32,7 +32,7 @@ connectionPassword: "password" } - # Nebula Graph relation config + # Nebula Graph relation com.vesoft.exchange.common.config nebula: { address:{ graph:["127.0.0.1:9669", "127.0.0.1:9670", "127.0.0.1:9671"] @@ -78,7 +78,7 @@ } # Processing tags - # There are tag config examples for different dataSources. + # There are tag com.vesoft.exchange.common.config examples for different dataSources. tags: [ { name: person @@ -99,7 +99,7 @@ } ] - # There are tag config examples for different dataSources. + # There are tag com.vesoft.exchange.common.config examples for different dataSources. edges: [ { name: friend diff --git a/nebula-exchange/src/test/resources/vertex.csv b/exchange-common/src/test/resources/vertex.csv similarity index 100% rename from nebula-exchange/src/test/resources/vertex.csv rename to exchange-common/src/test/resources/vertex.csv diff --git a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/GraphProviderSuite.scala b/exchange-common/src/test/scala/com/vesoft/exchange/common/GraphProviderSuite.scala similarity index 89% rename from nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/GraphProviderSuite.scala rename to exchange-common/src/test/scala/com/vesoft/exchange/common/GraphProviderSuite.scala index 62677637..f2101925 100644 --- a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/GraphProviderSuite.scala +++ b/exchange-common/src/test/scala/com/vesoft/exchange/common/GraphProviderSuite.scala @@ -3,15 +3,13 @@ * This source code is licensed under Apache 2.0 License. 
*/ -package com.vesoft.nebula.exchange +package com.vesoft.exchange.common import com.google.common.net.HostAndPort +import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, UserConfigEntry} import com.vesoft.nebula.client.graph.net.Session -import com.vesoft.nebula.exchange.config.{SslConfigEntry, SslType, UserConfigEntry} import org.junit.{After, Before, Test} -import scala.com.vesoft.nebula.exchange.NebulaGraphMock - class GraphProviderSuite { var graphProvider: GraphProvider = _ var session: Session = _ diff --git a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/MetaProviderSuite.scala b/exchange-common/src/test/scala/com/vesoft/exchange/common/MetaProviderSuite.scala similarity index 93% rename from nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/MetaProviderSuite.scala rename to exchange-common/src/test/scala/com/vesoft/exchange/common/MetaProviderSuite.scala index 23979692..297af64f 100644 --- a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/MetaProviderSuite.scala +++ b/exchange-common/src/test/scala/com/vesoft/exchange/common/MetaProviderSuite.scala @@ -3,14 +3,12 @@ * This source code is licensed under Apache 2.0 License. */ -package com.vesoft.nebula.exchange +package com.vesoft.exchange.common import com.google.common.net.HostAndPort -import com.vesoft.nebula.exchange.config.{SslConfigEntry, SslType, Type} +import com.vesoft.exchange.common.config.{SslConfigEntry, SslType, Type} import org.junit.{After, Before, Test} -import scala.com.vesoft.nebula.exchange.NebulaGraphMock - class MetaProviderSuite { var metaProvider: MetaProvider = _ diff --git a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/NebulaGraphMock.scala b/exchange-common/src/test/scala/com/vesoft/exchange/common/NebulaGraphMock.scala similarity index 99% rename from nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/NebulaGraphMock.scala rename to exchange-common/src/test/scala/com/vesoft/exchange/common/NebulaGraphMock.scala index dc866f5f..f087557d 100644 --- a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/NebulaGraphMock.scala +++ b/exchange-common/src/test/scala/com/vesoft/exchange/common/NebulaGraphMock.scala @@ -3,7 +3,7 @@ * This source code is licensed under Apache 2.0 License. 
*/ -package scala.com.vesoft.nebula.exchange +package com.vesoft.exchange.common import com.vesoft.nebula.client.graph.NebulaPoolConfig import com.vesoft.nebula.client.graph.data.HostAddress diff --git a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/config/ConfigsSuite.scala b/exchange-common/src/test/scala/com/vesoft/exchange/common/config/ConfigsSuite.scala similarity index 98% rename from nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/config/ConfigsSuite.scala rename to exchange-common/src/test/scala/com/vesoft/exchange/common/config/ConfigsSuite.scala index 03ca45aa..f4e42482 100644 --- a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/config/ConfigsSuite.scala +++ b/exchange-common/src/test/scala/com/vesoft/exchange/common/config/ConfigsSuite.scala @@ -7,7 +7,9 @@ package scala.com.vesoft.nebula.exchange.config import java.io.File -import com.vesoft.nebula.exchange.config.{ +import com.vesoft.exchange.Argument +import com.vesoft.exchange.common.KeyPolicy +import com.vesoft.exchange.common.config.{ Configs, DataBaseConfigEntry, FileBaseSourceConfigEntry, @@ -19,7 +21,6 @@ import com.vesoft.nebula.exchange.config.{ SinkCategory, SourceCategory } -import com.vesoft.nebula.exchange.{Argument, KeyPolicy} import org.apache.log4j.Logger import org.junit.Test import org.scalatest.Assertions.assertThrows @@ -243,7 +244,7 @@ class ConfigsSuite { } /** - * correct config + * correct com.vesoft.exchange.common.config */ @Test def dataBaseConfigSuite(): Unit = { diff --git a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/ProcessorSuite.scala b/exchange-common/src/test/scala/com/vesoft/exchange/common/processor/ProcessorSuite.scala similarity index 99% rename from nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/ProcessorSuite.scala rename to exchange-common/src/test/scala/com/vesoft/exchange/common/processor/ProcessorSuite.scala index c585b744..a7ad77e2 100644 --- a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/ProcessorSuite.scala +++ b/exchange-common/src/test/scala/com/vesoft/exchange/common/processor/ProcessorSuite.scala @@ -5,7 +5,7 @@ package scala.com.vesoft.nebula.exchange.processor -import com.vesoft.nebula.exchange.processor.Processor +import com.vesoft.exchange.common.processor.Processor import com.vesoft.nebula.{ Coordinate, Date, diff --git a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/utils/NebulaUtilsSuite.scala b/exchange-common/src/test/scala/com/vesoft/exchange/common/utils/NebulaUtilsSuite.scala similarity index 97% rename from nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/utils/NebulaUtilsSuite.scala rename to exchange-common/src/test/scala/com/vesoft/exchange/common/utils/NebulaUtilsSuite.scala index 6fe29cea..10aa449f 100644 --- a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/utils/NebulaUtilsSuite.scala +++ b/exchange-common/src/test/scala/com/vesoft/exchange/common/utils/NebulaUtilsSuite.scala @@ -6,24 +6,24 @@ package scala.com.vesoft.nebula.exchange.utils import com.google.common.net.HostAndPort +import com.vesoft.exchange.common.{MetaProvider, NebulaGraphMock, VidType} import com.vesoft.nebula.PropertyType import com.vesoft.nebula.client.graph.NebulaPoolConfig import com.vesoft.nebula.client.graph.data.HostAddress import com.vesoft.nebula.client.graph.net.NebulaPool -import com.vesoft.nebula.exchange.config.{ +import com.vesoft.exchange.common.KeyPolicy +import com.vesoft.exchange.common.config.{ NebulaSinkConfigEntry, SinkCategory, 
SslConfigEntry, TagConfigEntry } -import com.vesoft.nebula.exchange.utils.NebulaUtils -import com.vesoft.nebula.exchange.{KeyPolicy, MetaProvider, VidType} +import com.vesoft.exchange.common.utils.NebulaUtils import org.apache.log4j.Logger import org.junit.{After, Before, Test} import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer -import scala.com.vesoft.nebula.exchange.NebulaGraphMock class NebulaUtilsSuite { private[this] val LOG = Logger.getLogger(this.getClass) diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/KafkaUtils.scala b/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/KafkaUtils.scala deleted file mode 100644 index f30a5d79..00000000 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/KafkaUtils.scala +++ /dev/null @@ -1,14 +0,0 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. - * - * This source code is licensed under Apache 2.0 License. - */ - -package com.vesoft.nebula.exchange.utils - -import com.vesoft.nebula.exchange.{Edge, Vertex} - -object KafkaUtils { - - def writeVertices(vertices: Vertex*): Unit = {} - def writeEdge(edges: Edge*): Unit = {} -} diff --git a/nebula-exchange_spark_2.2/pom.xml b/nebula-exchange_spark_2.2/pom.xml new file mode 100644 index 00000000..efa5350c --- /dev/null +++ b/nebula-exchange_spark_2.2/pom.xml @@ -0,0 +1,466 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>exchange</artifactId> + <groupId>com.vesoft</groupId> + <version>2.5-SNAPSHOT</version> + </parent> + <modelVersion>4.0.0</modelVersion> + + <artifactId>nebula-exchange_spark_2.2</artifactId> + + <properties> + <maven.compiler.source>1.8</maven.compiler.source> + <maven.compiler.target>1.8</maven.compiler.target> + <spark.version>2.2.0</spark.version> + <version.scala.binary>2.11</version.scala.binary> + <scala.version>2.11.12</scala.version> + <spark-csv.version>1.5.0</spark-csv.version> + <scala-logging.version>3.9.2</scala-logging.version> + <neo.version>2.4.5-M1</neo.version> + <gremlin.version>3.4.6</gremlin.version> + </properties> + + <build> + <testSourceDirectory>src/test</testSourceDirectory> + <plugins> + <!-- deploy-plugin --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-deploy-plugin</artifactId> + <version>2.8.2</version> + <executions> + <execution> + <id>default-deploy</id> + <phase>deploy</phase> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.scala-tools</groupId> + <artifactId>maven-scala-plugin</artifactId> + <version>2.15.2</version> + <configuration> + <scalaVersion>${scala.version}</scalaVersion> + <args> + <arg>-target:jvm-1.8</arg> + </args> + <jvmArgs> + <jvmArg>-Xss4096K</jvmArg> + </jvmArgs> + </configuration> + <executions> + <execution> + <id>scala-compile</id> + <goals> + <goal>compile</goal> + </goals> + <configuration> + <excludes> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </configuration> + </execution> + <execution> + <id>scala-test-compile</id> + <goals> + <goal>testCompile</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.12.4</version> + <configuration> + 
<includes> + <include>**/*Test.*</include> + <include>**/*Suite.*</include> + </includes> + </configuration> + </plugin> + <plugin> + <groupId>org.scalatest</groupId> + <artifactId>scalatest-maven-plugin</artifactId> + <version>2.0.0</version> + <executions> + <execution> + <id>test</id> + <goals> + <goal>test</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>3.2.1</version> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <createDependencyReducedPom>false</createDependencyReducedPom> + <artifactSet> + <excludes> + <exclude>org.apache.spark:*</exclude> + <exclude>org.apache.hadoop:*</exclude> + <exclude>org.apache.hive:*</exclude> + <exclude>log4j:log4j</exclude> + <exclude>org.apache.orc:*</exclude> + <exclude>xml-apis:xml-apis</exclude> + <exclude>javax.inject:javax.inject</exclude> + <exclude>org.spark-project.hive:hive-exec</exclude> + <exclude>stax:stax-api</exclude> + <exclude>org.glassfish.hk2.external:aopalliance-repackaged + </exclude> + </excludes> + </artifactSet> + <filters> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </filter> + </filters> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-source-plugin</artifactId> + <version>3.2.0</version> + <executions> + <execution> + <id>attach-sources</id> + <goals> + <goal>jar</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>net.alchim31.maven</groupId> + <artifactId>scala-maven-plugin</artifactId> + <version>4.4.0</version> + <executions> + <execution> + <id>Scaladoc</id> + <goals> + <goal>doc</goal> + </goals> + <phase>prepare-package</phase> + <configuration> + <args> + <arg>-nobootcp</arg> + <arg>-no-link-warnings</arg> + </args> + </configuration> + </execution> + <execution> + <id>attach-javadocs</id> + <goals> + <goal>doc-jar</goal> + </goals> + <configuration> + <args> + <arg>-nobootcp</arg> + <arg>-no-link-warnings</arg> + </args> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-javadoc-plugin</artifactId> + <version>3.2.0</version> + <configuration> + <excludePackageNames>com.facebook.thrift:com.facebook.thrift.* + </excludePackageNames> + </configuration> + <executions> + <execution> + <id>attach-javadocs</id> + <phase>package</phase> + <goals> + <goal>jar</goal> + </goals> + <configuration> + <encoding>UTF-8</encoding> + <charset>UTF-8</charset> + <additionalOptions> + <additionalparam>-source 8</additionalparam> + <additionalOption>-Xdoclint:none</additionalOption> + </additionalOptions> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.jacoco</groupId> + <artifactId>jacoco-maven-plugin</artifactId> + <version>0.8.7</version> + <executions> + <execution> + <goals> + <goal>prepare-agent</goal> + </goals> + </execution> + <execution> + <id>report</id> + <phase>test</phase> + <goals> + <goal>report</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + + <dependencies> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-catalyst_2.11</artifactId> + <version>${spark.version}</version> + <exclusions> + <exclusion> 
+ <artifactId>chill_2.11</artifactId> + <groupId>com.twitter</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-hive_2.11</artifactId> + <version>${spark.version}</version> + <exclusions> + <exclusion> + <artifactId>commons-codec</artifactId> + <groupId>commons-codec</groupId> + </exclusion> + <exclusion> + <artifactId>commons-logging</artifactId> + <groupId>commons-logging</groupId> + </exclusion> + <exclusion> + <artifactId>avro</artifactId> + <groupId>org.apache.avro</groupId> + </exclusion> + <exclusion> + <artifactId>commons-compress</artifactId> + <groupId>org.apache.commons</groupId> + </exclusion> + <exclusion> + <artifactId>commons-lang3</artifactId> + <groupId>org.apache.commons</groupId> + </exclusion> + <exclusion> + <artifactId>jackson-mapper-asl</artifactId> + <groupId>org.codehaus.jackson</groupId> + </exclusion> + <exclusion> + <artifactId>antlr-runtime</artifactId> + <groupId>org.antlr</groupId> + </exclusion> + <exclusion> + <artifactId>jackson-core-asl</artifactId> + <groupId>org.codehaus.jackson</groupId> + </exclusion> + <exclusion> + <artifactId>derby</artifactId> + <groupId>org.apache.derby</groupId> + </exclusion> + <exclusion> + <artifactId>httpclient</artifactId> + <groupId>org.apache.httpcomponents</groupId> + </exclusion> + <exclusion> + <artifactId>httpcore</artifactId> + <groupId>org.apache.httpcomponents</groupId> + </exclusion> + <exclusion> + <artifactId>commons-io</artifactId> + <groupId>commons-io</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-yarn_2.11</artifactId> + <version>${spark.version}</version> + <exclusions> + <exclusion> + <artifactId>guava</artifactId> + <groupId>com.google.guava</groupId> + </exclusion> + <exclusion> + <artifactId>commons-codec</artifactId> + <groupId>commons-codec</groupId> + </exclusion> + <exclusion> + <artifactId>commons-compress</artifactId> + <groupId>org.apache.commons</groupId> + </exclusion> + <exclusion> + <artifactId>activation</artifactId> + <groupId>javax.activation</groupId> + </exclusion> + <exclusion> + <artifactId>slf4j-api</artifactId> + <groupId>org.slf4j</groupId> + </exclusion> + <exclusion> + <artifactId>commons-io</artifactId> + <groupId>commons-io</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>com.databricks</groupId> + <artifactId>spark-csv_2.11</artifactId> + <version>${spark-csv.version}</version> + <exclusions> + <exclusion> + <artifactId>scala-library</artifactId> + <groupId>org.scala-lang</groupId> + </exclusion> + <exclusion> + <artifactId>univocity-parsers</artifactId> + <groupId>com.univocity</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_2.11</artifactId> + <version>${scalatest.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>com.typesafe.scala-logging</groupId> + <artifactId>scala-logging_2.11</artifactId> + <version>${scala-logging.version}</version> + <exclusions> + <exclusion> + <artifactId>scala-library</artifactId> + <groupId>org.scala-lang</groupId> + </exclusion> + <exclusion> + <artifactId>scala-reflect</artifactId> + <groupId>org.scala-lang</groupId> + </exclusion> + <exclusion> + <artifactId>slf4j-api</artifactId> + <groupId>org.slf4j</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>neo4j-contrib</groupId> + 
<artifactId>neo4j-spark-connector</artifactId> + <version>${neo.version}</version> + <exclusions> + <exclusion> + <artifactId>scala-library</artifactId> + <groupId>org.scala-lang</groupId> + </exclusion> + <exclusion> + <artifactId>slf4j-api</artifactId> + <groupId>org.slf4j</groupId> + </exclusion> + <exclusion> + <artifactId>xbean-asm6-shaded</artifactId> + <groupId>org.apache.xbean</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.tinkerpop</groupId> + <artifactId>spark-gremlin</artifactId> + <version>${gremlin.version}</version> + <exclusions> + <exclusion> + <artifactId>guava</artifactId> + <groupId>com.google.guava</groupId> + </exclusion> + <exclusion> + <artifactId>commons-lang3</artifactId> + <groupId>org.apache.commons</groupId> + </exclusion> + <exclusion> + <artifactId>jcl-over-slf4j</artifactId> + <groupId>org.slf4j</groupId> + </exclusion> + <exclusion> + <artifactId>commons-codec</artifactId> + <groupId>commons-codec</groupId> + </exclusion> + <exclusion> + <artifactId>spark-core_2.11</artifactId> + <groupId>org.apache.spark</groupId> + </exclusion> + <exclusion> + <artifactId>scala-xml_2.11</artifactId> + <groupId>org.scala-lang.modules</groupId> + </exclusion> + <exclusion> + <artifactId>scala-library</artifactId> + <groupId>org.scala-lang</groupId> + </exclusion> + <exclusion> + <artifactId>jackson-databind</artifactId> + <groupId>com.fasterxml.jackson.core</groupId> + </exclusion> + <exclusion> + <artifactId>hadoop-client</artifactId> + <groupId>org.apache.hadoop</groupId> + </exclusion> + <exclusion> + <artifactId>netty-all</artifactId> + <groupId>io.netty</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>com.aliyun.odps</groupId> + <artifactId>odps-spark-datasource_2.11</artifactId> + <version>3.3.8-public</version> + <exclusions> + <exclusion> + <artifactId>scala-actors</artifactId> + <groupId>org.scala-lang</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>com.vesoft</groupId> + <artifactId>exchange-common</artifactId> + <version>${project.version}</version> + </dependency> + </dependencies> + <repositories> + <repository> + <id>SparkPackagesRepo</id> + <url>https://repos.spark-packages.org</url> + </repository> + <repository> + <id>snapshots</id> + <url>https://oss.sonatype.org/content/repositories/snapshots/</url> + </repository> + </repositories> + +</project> diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala similarity index 91% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala rename to nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala index 903209fa..3c8a4653 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala +++ b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala @@ -7,7 +7,10 @@ package com.vesoft.nebula.exchange import org.apache.spark.sql.{DataFrame, SparkSession} import java.io.File -import com.vesoft.nebula.exchange.config.{ + +import com.vesoft.exchange.Argument +import com.vesoft.exchange.common.{CheckPointHandler, ErrorHandler} +import com.vesoft.exchange.common.config.{ ClickHouseConfigEntry, Configs, DataSourceConfigEntry, @@ -23,7 +26,6 @@ import com.vesoft.nebula.exchange.config.{ SinkCategory, SourceCategory } -import com.vesoft.nebula.exchange.processor.{EdgeProcessor, 
VerticesProcessor} import com.vesoft.nebula.exchange.reader.{ CSVReader, ClickhouseReader, @@ -39,16 +41,11 @@ import com.vesoft.nebula.exchange.reader.{ ParquetReader, PulsarReader } -import com.vesoft.nebula.exchange.processor.ReloadProcessor +import com.vesoft.exchange.common.processor.ReloadProcessor +import com.vesoft.nebula.exchange.processor.{EdgeProcessor, VerticesProcessor} import org.apache.log4j.Logger import org.apache.spark.SparkConf -final case class Argument(config: String = "application.conf", - hive: Boolean = false, - directly: Boolean = false, - dry: Boolean = false, - reload: String = "") - final case class TooManyErrorsException(private val message: String) extends Exception(message) /** @@ -82,10 +79,11 @@ object Exchange { val sparkConf = new SparkConf() sparkConf.registerKryoClasses(Array(classOf[com.facebook.thrift.async.TAsyncClientManager])) - // config hive for sparkSession + // com.vesoft.exchange.common.config hive for sparkSession if (c.hive) { if (configs.hiveConfigEntry.isEmpty) { - LOG.info("you don't config hive source, so using hive tied with spark.") + LOG.info( + "you don't com.vesoft.exchange.common.config hive source, so using hive tied with spark.") } else { val hiveConfig = configs.hiveConfigEntry.get sparkConf.set("spark.sql.warehouse.dir", hiveConfig.warehouse) @@ -132,8 +130,8 @@ object Exchange { val nebulaKeys = tagConfig.nebulaFields LOG.info(s"nebula keys: ${nebulaKeys.mkString(", ")}") - val fields = tagConfig.vertexField::tagConfig.fields - val data = createDataSource(spark, tagConfig.dataSourceConfigEntry, fields) + val fields = tagConfig.vertexField :: tagConfig.fields + val data = createDataSource(spark, tagConfig.dataSourceConfigEntry, fields) if (data.isDefined && !c.dry) { val startTime = System.currentTimeMillis() val batchSuccess = @@ -176,9 +174,9 @@ object Exchange { val nebulaKeys = edgeConfig.nebulaFields LOG.info(s"nebula keys: ${nebulaKeys.mkString(", ")}") val fields = if (edgeConfig.rankingField.isDefined) { - edgeConfig.rankingField.get::edgeConfig.sourceField::edgeConfig.targetField::edgeConfig.fields + edgeConfig.rankingField.get :: edgeConfig.sourceField :: edgeConfig.targetField :: edgeConfig.fields } else { - edgeConfig.sourceField::edgeConfig.targetField::edgeConfig.fields + edgeConfig.sourceField :: edgeConfig.targetField :: edgeConfig.fields } val data = createDataSource(spark, edgeConfig.dataSourceConfigEntry, fields) if (data.isDefined && !c.dry) { @@ -233,7 +231,7 @@ object Exchange { * Create data source for different data type. * * @param session The Spark Session. - * @param config The config. + * @param config The com.vesoft.exchange.common.config. 
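A side note on the field lists assembled above: the id columns are prepended with Scala's :: operator, so vertexField (and, for edges, rankingField/sourceField/targetField) always end up at the head of the list passed to createDataSource. A tiny self-contained sketch of that ordering, using made-up column names:

    object ConsOrderingSketch {
      def main(args: Array[String]): Unit = {
        // hypothetical column names, not taken from any real config file
        val fields      = List("name", "age")
        val vertexField = "id"

        // `::` prepends, so the id column comes first, mirroring
        // `tagConfig.vertexField :: tagConfig.fields`
        val tagFields = vertexField :: fields
        println(tagFields) // List(id, name, age)

        // edge variant with an optional ranking column, as in the hunk above
        val rankingField: Option[String] = Some("weight")
        val edgeFields = rankingField match {
          case Some(rank) => rank :: "src" :: "dst" :: fields
          case None       => "src" :: "dst" :: fields
        }
        println(edgeFields) // List(weight, src, dst, name, age)
      }
    }

Nothing here touches Spark; it only shows why the property columns trail the id (and optional ranking) columns.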
* @return */ private[this] def createDataSource( @@ -276,17 +274,17 @@ object Exchange { } case SourceCategory.NEO4J => val neo4jConfig = config.asInstanceOf[Neo4JSourceConfigEntry] - LOG.info(s"Loading from neo4j config: ${neo4jConfig}") + LOG.info(s"Loading from neo4j com.vesoft.exchange.common.config: ${neo4jConfig}") val reader = new Neo4JReader(session, neo4jConfig) Some(reader.read()) case SourceCategory.MYSQL => val mysqlConfig = config.asInstanceOf[MySQLSourceConfigEntry] - LOG.info(s"Loading from mysql config: ${mysqlConfig}") + LOG.info(s"Loading from mysql com.vesoft.exchange.common.config: ${mysqlConfig}") val reader = new MySQLReader(session, mysqlConfig) Some(reader.read()) case SourceCategory.PULSAR => val pulsarConfig = config.asInstanceOf[PulsarSourceConfigEntry] - LOG.info(s"Loading from pulsar config: ${pulsarConfig}") + LOG.info(s"Loading from pulsar com.vesoft.exchange.common.config: ${pulsarConfig}") val reader = new PulsarReader(session, pulsarConfig) Some(reader.read()) case SourceCategory.JANUS_GRAPH => diff --git a/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala new file mode 100644 index 00000000..ebe6ef6f --- /dev/null +++ b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala @@ -0,0 +1,384 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.nebula.exchange.processor + +import java.nio.ByteOrder + +import com.google.common.geometry.{S2CellId, S2LatLng} +import com.vesoft.exchange.common.{ErrorHandler, GraphProvider, MetaProvider, VidType} +import com.vesoft.exchange.common.{Edge, Edges, KeyPolicy} +import com.vesoft.exchange.common.config.{ + Configs, + EdgeConfigEntry, + FileBaseSinkConfigEntry, + SinkCategory +} +import com.vesoft.exchange.common.processor.Processor +import com.vesoft.exchange.common.utils.NebulaUtils +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.exchange.common.writer.{NebulaGraphClientWriter, NebulaSSTWriter} +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.encoder.NebulaCodecImpl +import com.vesoft.nebula.meta.EdgeItem +import org.apache.commons.codec.digest.MurmurHash2 +import org.apache.log4j.Logger +import org.apache.spark.TaskContext +import org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.{DataFrame, Encoders, Row} +import org.apache.spark.util.LongAccumulator + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +class EdgeProcessor(data: DataFrame, + edgeConfig: EdgeConfigEntry, + fieldKeys: List[String], + nebulaKeys: List[String], + config: Configs, + batchSuccess: LongAccumulator, + batchFailure: LongAccumulator) + extends Processor { + + @transient + private[this] lazy val LOG = Logger.getLogger(this.getClass) + + private[this] val DEFAULT_MIN_CELL_LEVEL = 10 + private[this] val DEFAULT_MAX_CELL_LEVEL = 18 + + private def processEachPartition(iterator: Iterator[Edge]): Unit = { + val graphProvider = + new GraphProvider(config.databaseConfig.getGraphAddress, + config.connectionConfig.timeout, + config.sslConfig) + val writer = new NebulaGraphClientWriter(config.databaseConfig, + config.userConfig, + config.rateConfig, + edgeConfig, + graphProvider) + val errorBuffer = ArrayBuffer[String]() + + writer.prepare() + // batch 
write tags + val startTime = System.currentTimeMillis + iterator.grouped(edgeConfig.batch).foreach { edge => + val edges = Edges(nebulaKeys, edge.toList, edgeConfig.sourcePolicy, edgeConfig.targetPolicy) + val failStatement = writer.writeEdges(edges) + if (failStatement == null) { + batchSuccess.add(1) + } else { + errorBuffer.append(failStatement) + batchFailure.add(1) + } + } + if (errorBuffer.nonEmpty) { + ErrorHandler.save( + errorBuffer, + s"${config.errorConfig.errorPath}/${edgeConfig.name}.${TaskContext.getPartitionId}") + errorBuffer.clear() + } + LOG.info(s"edge ${edgeConfig.name} import in spark partition ${TaskContext + .getPartitionId()} cost ${System.currentTimeMillis() - startTime}ms") + writer.close() + graphProvider.close() + } + + override def process(): Unit = { + + val address = config.databaseConfig.getMetaAddress + val space = config.databaseConfig.space + + val timeout = config.connectionConfig.timeout + val retry = config.connectionConfig.retry + val metaProvider = new MetaProvider(address, timeout, retry, config.sslConfig) + val fieldTypeMap = NebulaUtils.getDataSourceFieldType(edgeConfig, space, metaProvider) + val isVidStringType = metaProvider.getVidType(space) == VidType.STRING + val partitionNum = metaProvider.getPartNumber(space) + + if (edgeConfig.dataSinkConfigEntry.category == SinkCategory.SST) { + val fileBaseConfig = edgeConfig.dataSinkConfigEntry.asInstanceOf[FileBaseSinkConfigEntry] + val namenode = fileBaseConfig.fsName.orNull + val edgeName = edgeConfig.name + + val vidType = metaProvider.getVidType(space) + val spaceVidLen = metaProvider.getSpaceVidLen(space) + val edgeItem = metaProvider.getEdgeItem(space, edgeName) + + val distintData = if (edgeConfig.rankingField.isDefined) { + data.dropDuplicates(edgeConfig.sourceField, + edgeConfig.targetField, + edgeConfig.rankingField.get) + } else { + data.dropDuplicates(edgeConfig.sourceField, edgeConfig.targetField) + } + distintData + .mapPartitions { iter => + iter.map { row => + encodeEdge(row, partitionNum, vidType, spaceVidLen, edgeItem, fieldTypeMap) + } + }(Encoders.tuple(Encoders.BINARY, Encoders.BINARY, Encoders.BINARY)) + .flatMap(line => { + List((line._1, line._3), (line._2, line._3)) + })(Encoders.tuple(Encoders.BINARY, Encoders.BINARY)) + .toDF("key", "value") + .sortWithinPartitions("key") + .foreachPartition { iterator: Iterator[Row] => + val sstFileWriter = new NebulaSSTWriter + sstFileWriter.writeSstFiles(iterator, + fileBaseConfig, + partitionNum, + namenode, + batchFailure) + } + } else { + val streamFlag = data.isStreaming + val edgeFrame = data + .filter { row => + isEdgeValid(row, edgeConfig, streamFlag, isVidStringType) + } + .map { row => + convertToEdge(row, edgeConfig, isVidStringType, fieldKeys, fieldTypeMap) + }(Encoders.kryo[Edge]) + + edgeFrame.foreachPartition(processEachPartition _) + } + } + + private[this] def indexCells(lat: Double, lng: Double): IndexedSeq[Long] = { + val coordinate = S2LatLng.fromDegrees(lat, lng) + val s2CellId = S2CellId.fromLatLng(coordinate) + for (index <- DEFAULT_MIN_CELL_LEVEL to DEFAULT_MAX_CELL_LEVEL) + yield s2CellId.parent(index).id() + } + + /** + * filter and check row data for edge, if streaming only print log + */ + def isEdgeValid(row: Row, + edgeConfig: EdgeConfigEntry, + streamFlag: Boolean, + isVidStringType: Boolean): Boolean = { + val sourceFlag = checkField(edgeConfig.sourceField, + "source_field", + row, + edgeConfig.sourcePolicy, + streamFlag, + isVidStringType) + + val targetFlag = checkField(edgeConfig.targetField, + 
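The client-sink path of processEachPartition above reduces to: group the partition iterator into batches of edgeConfig.batch, bump the success/failure accumulators per batch, and buffer any failed statement for the error file. A stripped-down sketch of that loop with a stand-in writer in place of NebulaGraphClientWriter (the writer below is invented; only the null-on-success convention is taken from the code above):

    import scala.collection.mutable.ArrayBuffer

    object BatchWriteSketch {
      // stand-in writer: returns null on success, the failed statement otherwise,
      // mirroring the convention used with writeEdges above
      def writeBatch(batch: List[Int]): String =
        if (batch.sum % 7 == 0) s"failed: INSERT ... ${batch.mkString(",")}" else null

      def main(args: Array[String]): Unit = {
        val batchSize   = 3
        val errorBuffer = ArrayBuffer[String]()
        var success     = 0L
        var failure     = 0L

        (1 to 10).iterator.grouped(batchSize).foreach { batch =>
          val failStatement = writeBatch(batch.toList)
          if (failStatement == null) success += 1
          else {
            errorBuffer.append(failStatement)
            failure += 1
          }
        }
        println(s"success=$success failure=$failure buffered=${errorBuffer.size}")
      }
    }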
"target_field", + row, + edgeConfig.targetPolicy, + streamFlag, + isVidStringType) + + val edgeRankFlag = if (edgeConfig.rankingField.isDefined) { + val index = row.schema.fieldIndex(edgeConfig.rankingField.get) + if (index < 0 || row.isNullAt(index)) { + printChoice(streamFlag, s"rank must exist and cannot be null, your row data is $row") + } + val ranking = row.get(index).toString + if (!NebulaUtils.isNumic(ranking)) { + printChoice(streamFlag, + s"Not support non-Numeric type for ranking field.your row data is $row") + false + } else true + } else true + sourceFlag && targetFlag && edgeRankFlag + } + + /** + * check if edge source id and target id valid + */ + def checkField(field: String, + fieldType: String, + row: Row, + policy: Option[KeyPolicy.Value], + streamFlag: Boolean, + isVidStringType: Boolean): Boolean = { + val fieldValue = if (edgeConfig.isGeo && "source_field".equals(fieldType)) { + val lat = row.getDouble(row.schema.fieldIndex(edgeConfig.latitude.get)) + val lng = row.getDouble(row.schema.fieldIndex(edgeConfig.longitude.get)) + Some(indexCells(lat, lng).mkString(",")) + } else { + val index = row.schema.fieldIndex(field) + if (index < 0 || row.isNullAt(index)) { + printChoice(streamFlag, s"$fieldType must exist and cannot be null, your row data is $row") + None + } else Some(row.get(index).toString) + } + + val idFlag = fieldValue.isDefined + val policyFlag = + if (idFlag && policy.isEmpty && !isVidStringType + && !NebulaUtils.isNumic(fieldValue.get)) { + printChoice( + streamFlag, + s"space vidType is int, but your $fieldType $fieldValue is not numeric.your row data is $row") + false + } else if (idFlag && policy.isDefined && isVidStringType) { + printChoice( + streamFlag, + s"only int vidType can use policy, but your vidType is FIXED_STRING.your row data is $row") + false + } else true + idFlag && policyFlag + } + + /** + * convert row data to {@link Edge} + */ + def convertToEdge(row: Row, + edgeConfig: EdgeConfigEntry, + isVidStringType: Boolean, + fieldKeys: List[String], + fieldTypeMap: Map[String, Int]): Edge = { + val sourceField = processField(edgeConfig.sourceField, + "source_field", + row, + edgeConfig.sourcePolicy, + isVidStringType) + + val targetField = processField(edgeConfig.targetField, + "target_field", + row, + edgeConfig.targetPolicy, + isVidStringType) + + val values = for { + property <- fieldKeys if property.trim.length != 0 + } yield extraValueForClient(row, property, fieldTypeMap) + + if (edgeConfig.rankingField.isDefined) { + val index = row.schema.fieldIndex(edgeConfig.rankingField.get) + val ranking = row.get(index).toString + Edge(sourceField, targetField, Some(ranking.toLong), values) + } else { + Edge(sourceField, targetField, None, values) + } + } + + /** + * process edge source and target field + */ + def processField(field: String, + fieldType: String, + row: Row, + policy: Option[KeyPolicy.Value], + isVidStringType: Boolean): String = { + var fieldValue = if (edgeConfig.isGeo && "source_field".equals(fieldType)) { + val lat = row.getDouble(row.schema.fieldIndex(edgeConfig.latitude.get)) + val lng = row.getDouble(row.schema.fieldIndex(edgeConfig.longitude.get)) + indexCells(lat, lng).mkString(",") + } else { + val index = row.schema.fieldIndex(field) + val value = row.get(index).toString + if (value.equals(DEFAULT_EMPTY_VALUE)) "" else value + } + // process string type vid + if (policy.isEmpty && isVidStringType) { + fieldValue = NebulaUtils.escapeUtil(fieldValue).mkString("\"", "", "\"") + } + fieldValue + } + + /** + * encode edge + 
*/ + def encodeEdge(row: Row, + partitionNum: Int, + vidType: VidType.Value, + spaceVidLen: Int, + edgeItem: EdgeItem, + fieldTypeMap: Map[String, Int]): (Array[Byte], Array[Byte], Array[Byte]) = { + isEdgeValid(row, edgeConfig, false, vidType == VidType.STRING) + + val srcIndex: Int = row.schema.fieldIndex(edgeConfig.sourceField) + var srcId: String = row.get(srcIndex).toString + if (srcId.equals(DEFAULT_EMPTY_VALUE)) { srcId = "" } + + val dstIndex: Int = row.schema.fieldIndex(edgeConfig.targetField) + var dstId: String = row.get(dstIndex).toString + if (dstId.equals(DEFAULT_EMPTY_VALUE)) { dstId = "" } + + if (edgeConfig.sourcePolicy.isDefined) { + edgeConfig.sourcePolicy.get match { + case KeyPolicy.HASH => + srcId = MurmurHash2 + .hash64(srcId.getBytes(), srcId.getBytes().length, 0xc70f6907) + .toString + case KeyPolicy.UUID => + throw new UnsupportedOperationException("do not support uuid yet") + case _ => + throw new IllegalArgumentException(s"policy ${edgeConfig.sourcePolicy.get} is invalidate") + } + } + if (edgeConfig.targetPolicy.isDefined) { + edgeConfig.targetPolicy.get match { + case KeyPolicy.HASH => + dstId = MurmurHash2 + .hash64(dstId.getBytes(), dstId.getBytes().length, 0xc70f6907) + .toString + case KeyPolicy.UUID => + throw new UnsupportedOperationException("do not support uuid yet") + case _ => + throw new IllegalArgumentException(s"policy ${edgeConfig.targetPolicy.get} is invalidate") + } + } + + val ranking: Long = if (edgeConfig.rankingField.isDefined) { + val rankIndex = row.schema.fieldIndex(edgeConfig.rankingField.get) + row.get(rankIndex).toString.toLong + } else { + 0 + } + + val srcPartitionId = NebulaUtils.getPartitionId(srcId, partitionNum, vidType) + val dstPartitionId = NebulaUtils.getPartitionId(dstId, partitionNum, vidType) + val codec = new NebulaCodecImpl() + + import java.nio.ByteBuffer + val srcBytes = if (vidType == VidType.INT) { + ByteBuffer + .allocate(8) + .order(ByteOrder.nativeOrder) + .putLong(srcId.toLong) + .array + } else { + srcId.getBytes() + } + + val dstBytes = if (vidType == VidType.INT) { + ByteBuffer + .allocate(8) + .order(ByteOrder.nativeOrder) + .putLong(dstId.toLong) + .array + } else { + dstId.getBytes() + } + val positiveEdgeKey = codec.edgeKeyByDefaultVer(spaceVidLen, + srcPartitionId, + srcBytes, + edgeItem.getEdge_type, + ranking, + dstBytes) + val reverseEdgeKey = codec.edgeKeyByDefaultVer(spaceVidLen, + dstPartitionId, + dstBytes, + -edgeItem.getEdge_type, + ranking, + srcBytes) + + val values = for { + property <- fieldKeys if property.trim.length != 0 + } yield + extraValueForSST(row, property, fieldTypeMap) + .asInstanceOf[AnyRef] + + val edgeValue = codec.encodeEdge(edgeItem, nebulaKeys.asJava, values.asJava) + (positiveEdgeKey, reverseEdgeKey, edgeValue) + } +} diff --git a/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala new file mode 100644 index 00000000..98e9c26c --- /dev/null +++ b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala @@ -0,0 +1,255 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
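For an int-typed space, encodeEdge above packs each numeric id into 8 bytes in platform byte order before handing the keys to NebulaCodecImpl; string-typed spaces use the raw id bytes instead. The packing step in isolation, with an arbitrary id value:

    import java.nio.{ByteBuffer, ByteOrder}

    object VidBytesSketch {
      def main(args: Array[String]): Unit = {
        val vid: Long = 42L // arbitrary example id

        // same packing as the VidType.INT branch above: 8 bytes, platform byte order
        val vidBytes: Array[Byte] = ByteBuffer
          .allocate(8)
          .order(ByteOrder.nativeOrder)
          .putLong(vid)
          .array

        // string-typed spaces skip this and use the id string's getBytes() directly
        println(vidBytes.map(b => (b & 0xff).toHexString).mkString(" "))
      }
    }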
+ */ + +package com.vesoft.nebula.exchange.processor + +import java.nio.ByteOrder + +import com.vesoft.exchange.common.{ErrorHandler, GraphProvider, MetaProvider, VidType} +import com.vesoft.exchange.common.{KeyPolicy, Vertex, Vertices} +import com.vesoft.exchange.common.config.{ + Configs, + FileBaseSinkConfigEntry, + SinkCategory, + TagConfigEntry +} +import com.vesoft.exchange.common.processor.Processor +import com.vesoft.exchange.common.utils.NebulaUtils +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.exchange.common.writer.{NebulaGraphClientWriter, NebulaSSTWriter} +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.encoder.NebulaCodecImpl +import com.vesoft.nebula.meta.TagItem +import org.apache.commons.codec.digest.MurmurHash2 +import org.apache.log4j.Logger +import org.apache.spark.TaskContext +import org.apache.spark.sql.{DataFrame, Encoders, Row} +import org.apache.spark.util.LongAccumulator + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +/** + * + * @param data + * @param tagConfig + * @param fieldKeys + * @param nebulaKeys + * @param config + * @param batchSuccess + * @param batchFailure + */ +class VerticesProcessor(data: DataFrame, + tagConfig: TagConfigEntry, + fieldKeys: List[String], + nebulaKeys: List[String], + config: Configs, + batchSuccess: LongAccumulator, + batchFailure: LongAccumulator) + extends Processor { + + @transient + private[this] lazy val LOG = Logger.getLogger(this.getClass) + + private def processEachPartition(iterator: Iterator[Vertex]): Unit = { + val graphProvider = + new GraphProvider(config.databaseConfig.getGraphAddress, + config.connectionConfig.timeout, + config.sslConfig) + + val writer = new NebulaGraphClientWriter(config.databaseConfig, + config.userConfig, + config.rateConfig, + tagConfig, + graphProvider) + + val errorBuffer = ArrayBuffer[String]() + + writer.prepare() + // batch write tags + val startTime = System.currentTimeMillis + iterator.grouped(tagConfig.batch).foreach { vertex => + val vertices = Vertices(nebulaKeys, vertex.toList, tagConfig.vertexPolicy) + val failStatement = writer.writeVertices(vertices) + if (failStatement == null) { + batchSuccess.add(1) + } else { + errorBuffer.append(failStatement) + batchFailure.add(1) + } + } + if (errorBuffer.nonEmpty) { + ErrorHandler.save( + errorBuffer, + s"${config.errorConfig.errorPath}/${tagConfig.name}.${TaskContext.getPartitionId()}") + errorBuffer.clear() + } + LOG.info(s"tag ${tagConfig.name} import in spark partition ${TaskContext + .getPartitionId()} cost ${System.currentTimeMillis() - startTime} ms") + writer.close() + graphProvider.close() + } + + override def process(): Unit = { + + val address = config.databaseConfig.getMetaAddress + val space = config.databaseConfig.space + + val timeout = config.connectionConfig.timeout + val retry = config.connectionConfig.retry + val metaProvider = new MetaProvider(address, timeout, retry, config.sslConfig) + val fieldTypeMap = NebulaUtils.getDataSourceFieldType(tagConfig, space, metaProvider) + val isVidStringType = metaProvider.getVidType(space) == VidType.STRING + val partitionNum = metaProvider.getPartNumber(space) + + if (tagConfig.dataSinkConfigEntry.category == SinkCategory.SST) { + val fileBaseConfig = tagConfig.dataSinkConfigEntry.asInstanceOf[FileBaseSinkConfigEntry] + val namenode = fileBaseConfig.fsName.orNull + val tagName = tagConfig.name + val vidType = metaProvider.getVidType(space) + + val spaceVidLen = 
metaProvider.getSpaceVidLen(space) + val tagItem = metaProvider.getTagItem(space, tagName) + + data + .dropDuplicates(tagConfig.vertexField) + .mapPartitions { iter => + iter.map { row => + encodeVertex(row, partitionNum, vidType, spaceVidLen, tagItem, fieldTypeMap) + } + }(Encoders.tuple(Encoders.BINARY, Encoders.BINARY)) + .toDF("key", "value") + .sortWithinPartitions("key") + .foreachPartition { iterator: Iterator[Row] => + val sstFileWriter = new NebulaSSTWriter + sstFileWriter.writeSstFiles(iterator, + fileBaseConfig, + partitionNum, + namenode, + batchFailure) + } + } else { + val streamFlag = data.isStreaming + val vertices = data + .filter { row => + isVertexValid(row, tagConfig, streamFlag, isVidStringType) + } + .map { row => + convertToVertex(row, tagConfig, isVidStringType, fieldKeys, fieldTypeMap) + }(Encoders.kryo[Vertex]) + + vertices.foreachPartition(processEachPartition _) + } + } + + /** + * filter and check row data for vertex, if streaming only print log + * for not streaming datasource, if the vertex data is invalid, throw AssertException. + */ + def isVertexValid(row: Row, + tagConfig: TagConfigEntry, + streamFlag: Boolean, + isVidStringType: Boolean): Boolean = { + val index = row.schema.fieldIndex(tagConfig.vertexField) + if (index < 0 || row.isNullAt(index)) { + printChoice(streamFlag, s"vertexId must exist and cannot be null, your row data is $row") + return false + } + + val vertexId = row.get(index).toString + // process int type vid + if (tagConfig.vertexPolicy.isEmpty && !isVidStringType && !NebulaUtils.isNumic(vertexId)) { + printChoice( + streamFlag, + s"space vidType is int, but your vertex id $vertexId is not numeric.your row data is $row") + return false + } + // process string type vid + if (tagConfig.vertexPolicy.isDefined && isVidStringType) { + printChoice( + streamFlag, + s"only int vidType can use policy, but your vidType is FIXED_STRING.your row data is $row") + return false + } + true + } + + /** + * Convert row data to {@link Vertex} + */ + def convertToVertex(row: Row, + tagConfig: TagConfigEntry, + isVidStringType: Boolean, + fieldKeys: List[String], + fieldTypeMap: Map[String, Int]): Vertex = { + val index = row.schema.fieldIndex(tagConfig.vertexField) + var vertexId = row.get(index).toString + if (vertexId.equals(DEFAULT_EMPTY_VALUE)) { + vertexId = "" + } + + if (tagConfig.vertexPolicy.isEmpty && isVidStringType) { + vertexId = NebulaUtils.escapeUtil(vertexId).mkString("\"", "", "\"") + } + + val values = for { + property <- fieldKeys if property.trim.length != 0 + } yield extraValueForClient(row, property, fieldTypeMap) + Vertex(vertexId, values) + } + + /** + * encode vertex + */ + def encodeVertex(row: Row, + partitionNum: Int, + vidType: VidType.Value, + spaceVidLen: Int, + tagItem: TagItem, + fieldTypeMap: Map[String, Int]): (Array[Byte], Array[Byte]) = { + // check if vertex id is valid, if not, throw AssertException + isVertexValid(row, tagConfig, false, vidType == VidType.STRING) + + val index: Int = row.schema.fieldIndex(tagConfig.vertexField) + var vertexId: String = row.get(index).toString + if (vertexId.equals(DEFAULT_EMPTY_VALUE)) { + vertexId = "" + } + if (tagConfig.vertexPolicy.isDefined) { + tagConfig.vertexPolicy.get match { + case KeyPolicy.HASH => + vertexId = MurmurHash2 + .hash64(vertexId.getBytes(), vertexId.getBytes().length, 0xc70f6907) + .toString + case KeyPolicy.UUID => + throw new UnsupportedOperationException("do not support uuid yet") + case _ => + throw new IllegalArgumentException(s"policy 
${tagConfig.vertexPolicy.get} is invalidate") + } + } + + val partitionId = NebulaUtils.getPartitionId(vertexId, partitionNum, vidType) + + import java.nio.ByteBuffer + val vidBytes = if (vidType == VidType.INT) { + ByteBuffer + .allocate(8) + .order(ByteOrder.nativeOrder) + .putLong(vertexId.toLong) + .array + } else { + vertexId.getBytes() + } + val codec = new NebulaCodecImpl() + val vertexKey = codec.vertexKey(spaceVidLen, partitionId, vidBytes, tagItem.getTag_id) + val values = for { + property <- fieldKeys if property.trim.length != 0 + } yield + extraValueForSST(row, property, fieldTypeMap) + .asInstanceOf[AnyRef] + val vertexValue = codec.encodeTag(tagItem, nebulaKeys.asJava, values.asJava) + (vertexKey, vertexValue) + } +} diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala similarity index 96% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala rename to nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala index 8d66b3d1..2cd7e476 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala +++ b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala @@ -5,8 +5,8 @@ package com.vesoft.nebula.exchange.reader -import com.vesoft.nebula.exchange.config.FileBaseSourceConfigEntry -import com.vesoft.nebula.exchange.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SparkSession} diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala similarity index 95% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala rename to nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala index 6c68b12b..fb8455e6 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala +++ b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala @@ -5,8 +5,8 @@ package com.vesoft.nebula.exchange.reader -import com.vesoft.nebula.exchange.Offset -import com.vesoft.nebula.exchange.utils.HDFSUtils +import com.vesoft.exchange.common.Offset +import com.vesoft.exchange.common.utils.HDFSUtils import org.apache.spark.sql.{DataFrame, SparkSession} /** diff --git a/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala new file mode 100644 index 00000000..7380374d --- /dev/null +++ b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala @@ -0,0 +1,257 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
+ */ + +package com.vesoft.nebula.exchange.reader + +import com.google.common.collect.Maps +import com.vesoft.exchange.common.config.{ + ClickHouseConfigEntry, + HBaseSourceConfigEntry, + HiveSourceConfigEntry, + JanusGraphSourceConfigEntry, + MaxComputeConfigEntry, + MySQLSourceConfigEntry, + Neo4JSourceConfigEntry, + ServerDataSourceConfigEntry +} +import org.apache.hadoop.hbase.HBaseConfiguration +import org.apache.hadoop.hbase.client.Result +import org.apache.hadoop.hbase.io.ImmutableBytesWritable +import org.apache.hadoop.hbase.mapreduce.TableInputFormat +import org.apache.hadoop.hbase.util.Bytes +import org.apache.log4j.Logger +import org.apache.spark.TaskContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.types.{DataTypes, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.tinkerpop.gremlin.process.computer.clustering.peerpressure.{ + ClusterCountMapReduce, + PeerPressureVertexProgram +} +import org.apache.tinkerpop.gremlin.spark.process.computer.SparkGraphComputer +import org.apache.tinkerpop.gremlin.spark.structure.io.PersistedOutputRDD +import org.apache.tinkerpop.gremlin.structure.util.GraphFactory +import org.neo4j.driver.internal.types.{TypeConstructor, TypeRepresentation} +import org.neo4j.driver.{AuthTokens, GraphDatabase} +import org.neo4j.spark.dataframe.CypherTypes +import org.neo4j.spark.utils.Neo4jSessionAwareIterator +import org.neo4j.spark.{Executor, Neo4jConfig} + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +/** + * ServerBaseReader is the abstract class of + * It include a spark session and a sentence which will sent to service. + * @param session + * @param sentence + */ +abstract class ServerBaseReader(override val session: SparkSession, val sentence: String) + extends Reader { + + override def close(): Unit = { + session.close() + } +} + +/** + * HiveReader extends the @{link ServerBaseReader}. + * The HiveReader reading data from Apache Hive via sentence. + * @param session + * @param hiveConfig + */ +class HiveReader(override val session: SparkSession, hiveConfig: HiveSourceConfigEntry) + extends ServerBaseReader(session, hiveConfig.sentence) { + override def read(): DataFrame = { + session.sql(sentence) + } +} + +/** + * The MySQLReader extends the ServerBaseReader. + * The MySQLReader reading data from MySQL via sentence. 
+ * + * @param session + * @param mysqlConfig + */ +class MySQLReader(override val session: SparkSession, mysqlConfig: MySQLSourceConfigEntry) + extends ServerBaseReader(session, mysqlConfig.sentence) { + override def read(): DataFrame = { + val url = + s"jdbc:mysql://${mysqlConfig.host}:${mysqlConfig.port}/${mysqlConfig.database}?useUnicode=true&characterEncoding=utf-8" + val df = session.read + .format("jdbc") + .option("url", url) + .option("dbtable", mysqlConfig.table) + .option("user", mysqlConfig.user) + .option("password", mysqlConfig.password) + .load() + df.createOrReplaceTempView(mysqlConfig.table) + session.sql(sentence) + } +} + +/** + * Neo4JReader extends the ServerBaseReader + * this reader support checkpoint by sacrificing performance + * @param session + * @param neo4jConfig + */ +class Neo4JReader(override val session: SparkSession, neo4jConfig: Neo4JSourceConfigEntry) + extends ServerBaseReader(session, neo4jConfig.sentence) + with CheckPointSupport { + + @transient lazy private val LOG = Logger.getLogger(this.getClass) + + override def read(): DataFrame = { + throw new UnsupportedOperationException("Neo4j datasource is not support yet for spark2.2") + } +} + +/** + * JanusGraphReader extends the link ServerBaseReader + * @param session + * @param janusGraphConfig + */ +class JanusGraphReader(override val session: SparkSession, + janusGraphConfig: JanusGraphSourceConfigEntry) + extends ServerBaseReader(session, "") + with CheckPointSupport { + + override def read(): DataFrame = { + val graph = GraphFactory.open("conf/hadoop/hadoop-gryo.properties") + graph.configuration().setProperty("gremlin.hadoop.graphWriter", classOf[PersistedOutputRDD]) + graph.configuration().setProperty("gremlin.spark.persistContext", true) + + val result = graph + .compute(classOf[SparkGraphComputer]) + .program(PeerPressureVertexProgram.build().create(graph)) + .mapReduce(ClusterCountMapReduce.build().memoryKey("clusterCount").create()) + .submit() + .get() + + if (janusGraphConfig.isEdge) { + result.graph().edges() + } else { + result.graph().variables().asMap() + } + null + } +} + +/** + * + * @param session + * @param nebulaConfig + */ +class NebulaReader(override val session: SparkSession, nebulaConfig: ServerDataSourceConfigEntry) + extends ServerBaseReader(session, nebulaConfig.sentence) { + override def read(): DataFrame = ??? 
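The readers above all share one shape: a SparkSession plus a sentence, read() producing a DataFrame, close() shutting the session down, and unsupported sources (Neo4J on Spark 2.2, the NebulaReader stub) simply throwing. A minimal sketch of that contract against a local session; the temp view and query are placeholders, not anything from this patch:

    import org.apache.spark.sql.{DataFrame, SparkSession}

    // minimal sketch of the reader shape above: a session plus a sentence,
    // read() left to subclasses, close() shutting the session down
    abstract class SketchReader(val session: SparkSession, val sentence: String) {
      def read(): DataFrame
      def close(): Unit = session.close()
    }

    // a trivial concrete reader that just runs the sentence through Spark SQL,
    // roughly what HiveReader does in the hunk above
    class SqlSketchReader(s: SparkSession, q: String) extends SketchReader(s, q) {
      override def read(): DataFrame = session.sql(sentence)
    }

    object ReaderSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("reader-sketch").master("local[*]").getOrCreate()
        spark.range(5).createOrReplaceTempView("numbers") // placeholder data
        val reader = new SqlSketchReader(spark, "SELECT id FROM numbers")
        reader.read().show()
        reader.close()
      }
    }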
+} + +/** + * HBaseReader extends [[ServerBaseReader]] + * + */ +class HBaseReader(override val session: SparkSession, hbaseConfig: HBaseSourceConfigEntry) + extends ServerBaseReader(session, null) { + + private[this] val LOG = Logger.getLogger(this.getClass) + + override def read(): DataFrame = { + val cf = hbaseConfig.columnFamily + val scanConf = HBaseConfiguration.create() + scanConf.set("hbase.zookeeper.quorum", hbaseConfig.host) + scanConf.set("hbase.zookeeper.property.clientPort", hbaseConfig.port) + scanConf.set(TableInputFormat.INPUT_TABLE, hbaseConfig.table) + hbaseConfig.fields.filter(field => !field.equalsIgnoreCase("rowkey")) + scanConf.set(TableInputFormat.SCAN_COLUMNS, + hbaseConfig.fields + .filter(field => !field.equalsIgnoreCase("rowkey")) + .map(field => s"$cf:$field") + .mkString(" ")) + val fields = hbaseConfig.fields + + val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] = session.sparkContext.newAPIHadoopRDD( + scanConf, + classOf[TableInputFormat], + classOf[ImmutableBytesWritable], + classOf[Result]) + + val rowRDD = hbaseRDD.map(row => { + val values: ListBuffer[String] = new ListBuffer[String] + val result: Result = row._2 + + for (i <- fields.indices) { + if (fields(i).equalsIgnoreCase("rowkey")) { + values += Bytes.toString(result.getRow) + } else { + values += Bytes.toString(result.getValue(Bytes.toBytes(cf), Bytes.toBytes(fields(i)))) + } + } + Row.fromSeq(values.toList) + }) + val schema = StructType( + fields.map(field => DataTypes.createStructField(field, DataTypes.StringType, true))) + val dataFrame = session.createDataFrame(rowRDD, schema) + dataFrame + } +} + +/** + * MaxCompute Reader + */ +class MaxcomputeReader(override val session: SparkSession, maxComputeConfig: MaxComputeConfigEntry) + extends ServerBaseReader(session, maxComputeConfig.sentence) { + + override def read(): DataFrame = { + var dfReader = session.read + .format("org.apache.spark.aliyun.odps.datasource") + .option("odpsUrl", maxComputeConfig.odpsUrl) + .option("tunnelUrl", maxComputeConfig.tunnelUrl) + .option("table", maxComputeConfig.table) + .option("project", maxComputeConfig.project) + .option("accessKeyId", maxComputeConfig.accessKeyId) + .option("accessKeySecret", maxComputeConfig.accessKeySecret) + .option("numPartitions", maxComputeConfig.numPartitions) + + // if use partition read + if (maxComputeConfig.partitionSpec != null) { + dfReader = dfReader.option("partitionSpec", maxComputeConfig.partitionSpec) + } + + val df = dfReader.load() + import session._ + if (maxComputeConfig.sentence == null) { + df + } else { + df.createOrReplaceTempView(s"${maxComputeConfig.table}") + session.sql(maxComputeConfig.sentence) + } + } +} + +/** + * Clickhouse reader + */ +class ClickhouseReader(override val session: SparkSession, + clickHouseConfigEntry: ClickHouseConfigEntry) + extends ServerBaseReader(session, clickHouseConfigEntry.sentence) { + Class.forName("ru.yandex.clickhouse.ClickHouseDriver") + override def read(): DataFrame = { + val df = session.read + .format("jdbc") + .option("driver", "ru.yandex.clickhouse.ClickHouseDriver") + .option("url", clickHouseConfigEntry.url) + .option("user", clickHouseConfigEntry.user) + .option("password", clickHouseConfigEntry.passwd) + .option("numPartitions", clickHouseConfigEntry.numPartition) + .option("dbtable", clickHouseConfigEntry.table) + .option("query", clickHouseConfigEntry.sentence) + .load() + df + } +} diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala 
b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala similarity index 85% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala rename to nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala index 25c8fd50..a3640698 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala +++ b/nebula-exchange_spark_2.2/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala @@ -5,7 +5,7 @@ package com.vesoft.nebula.exchange.reader -import com.vesoft.nebula.exchange.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} +import com.vesoft.exchange.common.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{DataFrame, SparkSession} @@ -32,7 +32,8 @@ class KafkaReader(override val session: SparkSession, targetFields: List[String]) extends StreamingBaseReader(session) { - require(kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) + require( + kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) override def read(): DataFrame = { import org.apache.spark.sql.functions._ @@ -46,9 +47,11 @@ class KafkaReader(override val session: SparkSession, .option("startingOffsets", kafkaConfig.startingOffsets) val maxOffsetsPerTrigger = kafkaConfig.maxOffsetsPerTrigger - if(maxOffsetsPerTrigger.isDefined) reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) + if (maxOffsetsPerTrigger.isDefined) + reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) - reader.load() + reader + .load() .select($"value".cast(StringType)) .select(json_tuple($"value", fields: _*)) .toDF(fields: _*) diff --git a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala b/nebula-exchange_spark_2.2/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala similarity index 96% rename from nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala rename to nebula-exchange_spark_2.2/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala index a72ca16a..f4431873 100644 --- a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala +++ b/nebula-exchange_spark_2.2/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala @@ -7,13 +7,13 @@ package com.vesoft.nebula.exchange.processor import java.io.File +import com.vesoft.exchange.common.VidType import com.vesoft.nebula.PropertyType -import com.vesoft.nebula.exchange.{KeyPolicy, VidType} -import com.vesoft.nebula.exchange.config.{Configs, EdgeConfigEntry, TagConfigEntry} -import com.vesoft.nebula.exchange.utils.NebulaUtils.DEFAULT_EMPTY_VALUE -import com.vesoft.nebula.meta.{ColumnDef, ColumnTypeDef, EdgeItem, Schema, SchemaProp, TagItem} +import com.vesoft.exchange.common.KeyPolicy +import com.vesoft.exchange.common.config.{Configs, EdgeConfigEntry} +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.nebula.meta.{ColumnDef, ColumnTypeDef, EdgeItem, Schema, SchemaProp} import org.apache.commons.codec.binary.Hex -import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{ BooleanType, @@ -25,13 +25,15 @@ import 
org.apache.spark.sql.types.{ StructField, StructType } +import org.apache.spark.sql.{DataFrame, Row} import org.junit.Test import org.scalatest.Assertions.assertThrows import scala.collection.JavaConverters._ class EdgeProcessorSuite { - val config: Configs = Configs.parse(new File("src/test/resources/process_application.conf")) + val config: Configs = + Configs.parse(new File("../exchange-common/src/test/resources/process_application.conf")) var data: DataFrame = null var edgeConfig: EdgeConfigEntry = config.edgesConfig.head diff --git a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala b/nebula-exchange_spark_2.2/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala similarity index 95% rename from nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala rename to nebula-exchange_spark_2.2/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala index 5c8e27b8..2340df94 100644 --- a/nebula-exchange/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala +++ b/nebula-exchange_spark_2.2/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala @@ -7,10 +7,11 @@ package com.vesoft.nebula.exchange.processor import java.io.File +import com.vesoft.exchange.common.VidType import com.vesoft.nebula.PropertyType -import com.vesoft.nebula.exchange.{KeyPolicy, VidType} -import com.vesoft.nebula.exchange.config.{Configs, TagConfigEntry} -import com.vesoft.nebula.exchange.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.exchange.common.KeyPolicy +import com.vesoft.exchange.common.config.{Configs, TagConfigEntry} +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE import com.vesoft.nebula.meta.{ColumnDef, ColumnTypeDef, Schema, SchemaProp, TagItem} import org.apache.commons.codec.binary.Hex import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -24,14 +25,15 @@ import org.apache.spark.sql.types.{ StructField, StructType } -import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.{DataFrame, Row} import org.junit.Test import org.scalatest.Assertions.assertThrows import scala.collection.JavaConverters._ class VerticesProcessorSuite { - val config: Configs = Configs.parse(new File("src/test/resources/process_application.conf")) + val config: Configs = + Configs.parse(new File("../exchange-common/src/test/resources/process_application.conf")) var data: DataFrame = null var tagConfig: TagConfigEntry = config.tagsConfig.head diff --git a/nebula-exchange/.gitignore b/nebula-exchange_spark_2.4/.gitignore similarity index 100% rename from nebula-exchange/.gitignore rename to nebula-exchange_spark_2.4/.gitignore diff --git a/nebula-exchange/pom.xml b/nebula-exchange_spark_2.4/pom.xml similarity index 68% rename from nebula-exchange/pom.xml rename to nebula-exchange_spark_2.4/pom.xml index ae432333..7e628fa3 100644 --- a/nebula-exchange/pom.xml +++ b/nebula-exchange_spark_2.4/pom.xml @@ -3,41 +3,26 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <parent> - <artifactId>nebula-spark</artifactId> + <artifactId>exchange</artifactId> <groupId>com.vesoft</groupId> <version>2.5-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> <modelVersion>4.0.0</modelVersion> - <artifactId>nebula-exchange</artifactId> + 
<artifactId>nebula-exchange_spark_2.4</artifactId> <properties> <maven.compiler.source>1.8</maven.compiler.source> <maven.compiler.target>1.8</maven.compiler.target> - <scala.2.10.version>2.10.7</scala.2.10.version> - <scala.2.11.version>2.11.12</scala.2.11.version> - <scala.2.12.version>2.12.10</scala.2.12.version> - <scala.version>${scala.2.11.version}</scala.version> - <version.scala.binary>2.12</version.scala.binary> + <scala.version>2.11.12</scala.version> <spark.version>2.4.4</spark.version> <spark-csv.version>1.5.0</spark-csv.version> - <scalatest.version>3.2.0</scalatest.version> - <rocksdb.version>6.7.3</rocksdb.version> - <config.version>1.4.0</config.version> <scala-logging.version>3.9.2</scala-logging.version> - <scala-xml.version>2.11.0-M4</scala-xml.version> - <scopt.version>3.7.1</scopt.version> - <nebula.version>2.0.0-SNAPSHOT</nebula.version> - <s2.version>1.0.0</s2.version> <neo.version>2.4.5-M1</neo.version> <gremlin.version>3.4.6</gremlin.version> <janusgraph.version>0.5.0</janusgraph.version> - <pulsar.version>2.4.5</pulsar.version> - <commons-codec.version>1.14</commons-codec.version> - <hadoop.version>2.6.1</hadoop.version> - <hbase.version>1.2.0</hbase.version> - <kafka.version>2.0.0</kafka.version> + <pulsar.version>3.1.1.1</pulsar.version> </properties> <build> @@ -76,7 +61,6 @@ </goals> <configuration> <excludes> - <exclude>com/vesoft/tools/**</exclude> <exclude>META-INF/*.SF</exclude> <exclude>META-INF/*.DSA</exclude> <exclude>META-INF/*.RSA</exclude> @@ -88,17 +72,13 @@ <goals> <goal>testCompile</goal> </goals> - <configuration> - <excludes> - com/vesoft/tools/** - </excludes> - </configuration> </execution> </executions> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> + <version>2.12.4</version> <configuration> <includes> <include>**/*Test.*</include> @@ -150,7 +130,6 @@ <filter> <artifact>*:*</artifact> <excludes> - <exclude>com/vesoft/tools/**</exclude> <exclude>META-INF/*.SF</exclude> <exclude>META-INF/*.DSA</exclude> <exclude>META-INF/*.RSA</exclude> @@ -177,7 +156,7 @@ <plugin> <groupId>net.alchim31.maven</groupId> <artifactId>scala-maven-plugin</artifactId> - <!-- <version>4.4.0</version> --> + <version>4.4.0</version> <executions> <execution> <id>Scaladoc</id> @@ -204,6 +183,12 @@ </args> </configuration> </execution> + <execution> + <id>scala-compile-first</id> + <goals> + <goal>compile</goal> + </goals> + </execution> </executions> </plugin> <plugin> @@ -268,149 +253,10 @@ </dependency> <dependency> <groupId>io.streamnative.connectors</groupId> - <artifactId>pulsar-spark-connector_2.11</artifactId> + <artifactId>pulsar-spark-connector_2.12</artifactId> <version>${pulsar.version}</version> </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-core_2.11</artifactId> - <version>${spark.version}</version> - <scope>provided</scope> - <exclusions> - <exclusion> - <artifactId>snappy-java</artifactId> - <groupId>org.xerial.snappy</groupId> - </exclusion> - <exclusion> - <artifactId>paranamer</artifactId> - <groupId>com.thoughtworks.paranamer</groupId> - </exclusion> - <exclusion> - <artifactId>slf4j-api</artifactId> - <groupId>org.slf4j</groupId> - </exclusion> - <exclusion> - <artifactId>commons-codec</artifactId> - <groupId>commons-codec</groupId> - </exclusion> - <exclusion> - <artifactId>avro</artifactId> - <groupId>org.apache.avro</groupId> - </exclusion> - <exclusion> - <artifactId>commons-lang</artifactId> - <groupId>commons-lang</groupId> - </exclusion> - 
<exclusion> - <artifactId>commons-collections</artifactId> - <groupId>commons-collections</groupId> - </exclusion> - <exclusion> - <artifactId>commons-compress</artifactId> - <groupId>org.apache.commons</groupId> - </exclusion> - <exclusion> - <artifactId>commons-math3</artifactId> - <groupId>org.apache.commons</groupId> - </exclusion> - <exclusion> - <artifactId>guava</artifactId> - <groupId>com.google.guava</groupId> - </exclusion> - <exclusion> - <artifactId>httpclient</artifactId> - <groupId>org.apache.httpcomponents</groupId> - </exclusion> - <exclusion> - <artifactId>slf4j-log4j12</artifactId> - <groupId>org.slf4j</groupId> - </exclusion> - <exclusion> - <artifactId>netty</artifactId> - <groupId>io.netty</groupId> - </exclusion> - <exclusion> - <artifactId>jackson-annotations</artifactId> - <groupId>com.fasterxml.jackson.core</groupId> - </exclusion> - <exclusion> - <artifactId>scala-reflect</artifactId> - <groupId>org.scala-lang</groupId> - </exclusion> - <exclusion> - <artifactId>scala-library</artifactId> - <groupId>org.scala-lang</groupId> - </exclusion> - <exclusion> - <artifactId>jackson-databind</artifactId> - <groupId>com.fasterxml.jackson.core</groupId> - </exclusion> - <exclusion> - <artifactId>scala-xml_2.11</artifactId> - <groupId>org.scala-lang.modules</groupId> - </exclusion> - <exclusion> - <artifactId>log4j</artifactId> - <groupId>log4j</groupId> - </exclusion> - <exclusion> - <artifactId>javassist</artifactId> - <groupId>org.javassist</groupId> - </exclusion> - <exclusion> - <artifactId>commons-io</artifactId> - <groupId>commons-io</groupId> - </exclusion> - <exclusion> - <artifactId>commons-configuration</artifactId> - <groupId>commons-configuration</groupId> - </exclusion> - <exclusion> - <artifactId>jul-to-slf4j</artifactId> - <groupId>org.slf4j</groupId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-sql_2.11</artifactId> - <version>${spark.version}</version> - <scope>provided</scope> - <exclusions> - <exclusion> - <artifactId>snappy-java</artifactId> - <groupId>org.xerial.snappy</groupId> - </exclusion> - <exclusion> - <artifactId>jsr305</artifactId> - <groupId>com.google.code.findbugs</groupId> - </exclusion> - <exclusion> - <artifactId>slf4j-api</artifactId> - <groupId>org.slf4j</groupId> - </exclusion> - <exclusion> - <artifactId>jackson-core</artifactId> - <groupId>com.fasterxml.jackson.core</groupId> - </exclusion> - <exclusion> - <artifactId>joda-time</artifactId> - <groupId>joda-time</groupId> - </exclusion> - <exclusion> - <artifactId>commons-codec</artifactId> - <groupId>commons-codec</groupId> - </exclusion> - <exclusion> - <artifactId>snappy-java</artifactId> - <groupId>org.xerial.snappy</groupId> - </exclusion> - <exclusion> - <artifactId>hppc</artifactId> - <groupId>com.carrotsearch</groupId> - </exclusion> - </exclusions> - </dependency> + <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-catalyst_2.11</artifactId> @@ -530,16 +376,6 @@ <version>${scalatest.version}</version> <scope>test</scope> </dependency> - <dependency> - <groupId>org.rocksdb</groupId> - <artifactId>rocksdbjni</artifactId> - <version>${rocksdb.version}</version> - </dependency> - <dependency> - <groupId>com.typesafe</groupId> - <artifactId>config</artifactId> - <version>${config.version}</version> - </dependency> <dependency> <groupId>com.typesafe.scala-logging</groupId> <artifactId>scala-logging_2.11</artifactId> @@ -559,31 +395,6 @@ </exclusion> </exclusions> </dependency> - <dependency> 
- <groupId>com.github.scopt</groupId> - <artifactId>scopt_2.11</artifactId> - <version>${scopt.version}</version> - </dependency> - <dependency> - <groupId>io.sgr</groupId> - <artifactId>s2-geometry-library-java</artifactId> - <version>${s2.version}</version> - </dependency> - <dependency> - <groupId>com.vesoft</groupId> - <artifactId>client</artifactId> - <version>${nebula.version}</version> - <exclusions> - <exclusion> - <artifactId>commons-lang3</artifactId> - <groupId>org.apache.commons</groupId> - </exclusion> - <exclusion> - <artifactId>commons-codec</artifactId> - <groupId>commons-codec</groupId> - </exclusion> - </exclusions> - </dependency> <dependency> <groupId>neo4j-contrib</groupId> <artifactId>neo4j-spark-connector</artifactId> @@ -712,36 +523,6 @@ </exclusion> </exclusions> </dependency> - <dependency> - <groupId>commons-codec</groupId> - <artifactId>commons-codec</artifactId> - <version>${commons-codec.version}</version> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-client</artifactId> - <version>${hadoop.version}</version> - </dependency> - <dependency> - <groupId>org.apache.hbase</groupId> - <artifactId>hbase-client</artifactId> - <version>${hbase.version}</version> - </dependency> - <dependency> - <groupId>org.apache.hbase</groupId> - <artifactId>hbase-common</artifactId> - <version>${hbase.version}</version> - </dependency> - <dependency> - <groupId>org.apache.hbase</groupId> - <artifactId>hbase-server</artifactId> - <version>${hbase.version}</version> - </dependency> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scala-xml</artifactId> - <version>${scala-xml.version}</version> - </dependency> <dependency> <groupId>com.aliyun.odps</groupId> <artifactId>odps-spark-datasource_2.11</artifactId> @@ -753,14 +534,9 @@ <version>2.2.0</version> </dependency> <dependency> - <groupId>ru.yandex.clickhouse</groupId> - <artifactId>clickhouse-jdbc</artifactId> - <version>0.2.5</version> - </dependency> - <dependency> - <groupId>org.locationtech.jts</groupId> - <artifactId>jts-core</artifactId> - <version>1.16.1</version> + <groupId>com.vesoft</groupId> + <artifactId>exchange-common</artifactId> + <version>${project.version}</version> </dependency> </dependencies> <repositories> diff --git a/nebula-exchange/src/main/resources/application.conf b/nebula-exchange_spark_2.4/src/main/resources/application.conf similarity index 93% rename from nebula-exchange/src/main/resources/application.conf rename to nebula-exchange_spark_2.4/src/main/resources/application.conf index 5a2f3624..6da798b6 100644 --- a/nebula-exchange/src/main/resources/application.conf +++ b/nebula-exchange_spark_2.4/src/main/resources/application.conf @@ -1,5 +1,5 @@ { - # Spark relation config + # Spark relation com.vesoft.exchange.common.config spark: { app: { name: Nebula Exchange 2.0 @@ -22,7 +22,7 @@ } # if the hive is hive-on-spark with derby mode, you can ignore this hive configure - # get the config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml + # get the com.vesoft.exchange.common.config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml # hive: { # warehouse: "hdfs://NAMENODE_IP:9000/apps/svr/hive-xxx/warehouse/" @@ -33,7 +33,7 @@ # } - # Nebula Graph relation config + # Nebula Graph relation com.vesoft.exchange.common.config nebula: { address:{ graph:["127.0.0.1:9669"] @@ -43,7 +43,7 @@ pswd: nebula space: test - # if config graph ssl encrypted transmission + # if com.vesoft.exchange.common.config graph ssl 
encrypted transmission ssl:{ # if enable is false, other params of ssl are invalid. enable:{ @@ -53,14 +53,14 @@ # ssl sign type: CA or SELF sign.type:ca - # if sign.type is CA, make sure config the ca.param. If you submit exchange application with cluster, please make sure each worker has the ca files. + # if sign.type is CA, make sure com.vesoft.exchange.common.config the ca.param. If you submit exchange application with cluster, please make sure each worker has the ca files. ca.param: { caCrtFilePath:"/path/caCrtFilePath" crtFilePath:"/path/crtFilePath" keyFilePath:"/path/keyFilePath" } - # if sign.type is SELF, make sure config the self.param. If you submit exchange application with cluster, please make sure each worker has the ca files. + # if sign.type is SELF, make sure com.vesoft.exchange.common.config the self.param. If you submit exchange application with cluster, please make sure each worker has the ca files. self.param: { crtFilePath:"/path/crtFilePath" keyFilePath:"/path/keyFilePath" @@ -100,7 +100,7 @@ } # Processing tags - # There are tag config examples for different dataSources. + # There are tag com.vesoft.exchange.common.config examples for different dataSources. tags: [ # HDFS parquet @@ -302,6 +302,7 @@ user:"user" password:"clickhouse" numPartition:"5" + table:"table" sentence:"select * from table" fields: [clickhouse-field-0, clickhouse-field-1, clickhouse-field-2] nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] @@ -315,7 +316,7 @@ ] # Processing edges - # There are edge config examples for different dataSources. + # There are edge com.vesoft.exchange.common.config examples for different dataSources. edges: [ # HDFS parquet # Import mode is client, just change type.sink to sst if you want to use sst import mode. @@ -535,6 +536,7 @@ user:"user" password:"clickhouse" numPartition:"5" + table:"table" sentence:"select * from table" fields: [clickhouse-field-2] nebula.fields: [nebula-field-2] diff --git a/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala new file mode 100644 index 00000000..3c8a4653 --- /dev/null +++ b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala @@ -0,0 +1,330 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
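The application.conf template above is HOCON, and Configs.parse(new File(c.config)) presumably resolves it through the Typesafe Config library that now comes in via exchange-common. A rough sketch of reading a few of the keys shown in that template directly; the file path is a placeholder:

    import java.io.File

    import scala.collection.JavaConverters._

    import com.typesafe.config.{Config, ConfigFactory}

    object ConfParseSketch {
      def main(args: Array[String]): Unit = {
        // the real path comes from the parsed Argument (c.config); this one is a placeholder
        val root: Config = ConfigFactory.parseFile(new File("application.conf"))

        // keys visible in the template above
        val space      = root.getString("nebula.space")
        val graphAddrs = root.getStringList("nebula.address.graph").asScala
        val tagCount   = root.getConfigList("tags").size()

        println(s"space=$space graph=${graphAddrs.mkString(",")} tags=$tagCount")
      }
    }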
+ */ + +package com.vesoft.nebula.exchange + +import org.apache.spark.sql.{DataFrame, SparkSession} +import java.io.File + +import com.vesoft.exchange.Argument +import com.vesoft.exchange.common.{CheckPointHandler, ErrorHandler} +import com.vesoft.exchange.common.config.{ + ClickHouseConfigEntry, + Configs, + DataSourceConfigEntry, + FileBaseSourceConfigEntry, + HBaseSourceConfigEntry, + HiveSourceConfigEntry, + JanusGraphSourceConfigEntry, + KafkaSourceConfigEntry, + MaxComputeConfigEntry, + MySQLSourceConfigEntry, + Neo4JSourceConfigEntry, + PulsarSourceConfigEntry, + SinkCategory, + SourceCategory +} +import com.vesoft.nebula.exchange.reader.{ + CSVReader, + ClickhouseReader, + HBaseReader, + HiveReader, + JSONReader, + JanusGraphReader, + KafkaReader, + MaxcomputeReader, + MySQLReader, + Neo4JReader, + ORCReader, + ParquetReader, + PulsarReader +} +import com.vesoft.exchange.common.processor.ReloadProcessor +import com.vesoft.nebula.exchange.processor.{EdgeProcessor, VerticesProcessor} +import org.apache.log4j.Logger +import org.apache.spark.SparkConf + +final case class TooManyErrorsException(private val message: String) extends Exception(message) + +/** + * SparkClientGenerator is a simple spark job used to write data into Nebula Graph parallel. + */ +object Exchange { + private[this] val LOG = Logger.getLogger(this.getClass) + + def main(args: Array[String]): Unit = { + val PROGRAM_NAME = "Nebula Graph Exchange" + val options = Configs.parser(args, PROGRAM_NAME) + val c: Argument = options match { + case Some(config) => config + case _ => + LOG.error("Argument parse failed") + sys.exit(-1) + } + + val configs = Configs.parse(new File(c.config)) + LOG.info(s"Config ${configs}") + + val session = SparkSession + .builder() + .appName(PROGRAM_NAME) + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + + for (key <- configs.sparkConfigEntry.map.keySet) { + session.config(key, configs.sparkConfigEntry.map(key)) + } + + val sparkConf = new SparkConf() + sparkConf.registerKryoClasses(Array(classOf[com.facebook.thrift.async.TAsyncClientManager])) + + // com.vesoft.exchange.common.config hive for sparkSession + if (c.hive) { + if (configs.hiveConfigEntry.isEmpty) { + LOG.info( + "you don't com.vesoft.exchange.common.config hive source, so using hive tied with spark.") + } else { + val hiveConfig = configs.hiveConfigEntry.get + sparkConf.set("spark.sql.warehouse.dir", hiveConfig.warehouse) + sparkConf + .set("javax.jdo.option.ConnectionURL", hiveConfig.connectionURL) + .set("javax.jdo.option.ConnectionDriverName", hiveConfig.connectionDriverName) + .set("javax.jdo.option.ConnectionUserName", hiveConfig.connectionUserName) + .set("javax.jdo.option.ConnectionPassword", hiveConfig.connectionPassWord) + } + } + + session.config(sparkConf) + + if (c.hive) { + session.enableHiveSupport() + } + + val spark = session.getOrCreate() + + // reload for failed import tasks + if (!c.reload.isEmpty) { + val batchSuccess = spark.sparkContext.longAccumulator(s"batchSuccess.reload") + val batchFailure = spark.sparkContext.longAccumulator(s"batchFailure.reload") + + val data = spark.read.text(c.reload) + val processor = new ReloadProcessor(data, configs, batchSuccess, batchFailure) + processor.process() + LOG.info(s"batchSuccess.reload: ${batchSuccess.value}") + LOG.info(s"batchFailure.reload: ${batchFailure.value}") + sys.exit(0) + } + + // record the failed batch number + var failures: Long = 0L + + // import tags + if (configs.tagsConfig.nonEmpty) { + for (tagConfig <- 
configs.tagsConfig) { + LOG.info(s"Processing Tag ${tagConfig.name}") + spark.sparkContext.setJobGroup(tagConfig.name, s"Tag: ${tagConfig.name}") + + val fieldKeys = tagConfig.fields + LOG.info(s"field keys: ${fieldKeys.mkString(", ")}") + val nebulaKeys = tagConfig.nebulaFields + LOG.info(s"nebula keys: ${nebulaKeys.mkString(", ")}") + + val fields = tagConfig.vertexField :: tagConfig.fields + val data = createDataSource(spark, tagConfig.dataSourceConfigEntry, fields) + if (data.isDefined && !c.dry) { + val startTime = System.currentTimeMillis() + val batchSuccess = + spark.sparkContext.longAccumulator(s"batchSuccess.${tagConfig.name}") + val batchFailure = + spark.sparkContext.longAccumulator(s"batchFailure.${tagConfig.name}") + + val processor = new VerticesProcessor( + repartition(data.get, tagConfig.partition, tagConfig.dataSourceConfigEntry.category), + tagConfig, + fieldKeys, + nebulaKeys, + configs, + batchSuccess, + batchFailure) + processor.process() + val costTime = ((System.currentTimeMillis() - startTime) / 1000.0).formatted("%.2f") + LOG.info(s"import for tag ${tagConfig.name} cost time: ${costTime} s") + if (tagConfig.dataSinkConfigEntry.category == SinkCategory.CLIENT) { + LOG.info(s"Client-Import: batchSuccess.${tagConfig.name}: ${batchSuccess.value}") + LOG.info(s"Client-Import: batchFailure.${tagConfig.name}: ${batchFailure.value}") + failures += batchFailure.value + } else { + LOG.info(s"SST-Import: failure.${tagConfig.name}: ${batchFailure.value}") + } + } + } + } else { + LOG.warn("Tag is not defined") + } + + // import edges + if (configs.edgesConfig.nonEmpty) { + for (edgeConfig <- configs.edgesConfig) { + LOG.info(s"Processing Edge ${edgeConfig.name}") + spark.sparkContext.setJobGroup(edgeConfig.name, s"Edge: ${edgeConfig.name}") + + val fieldKeys = edgeConfig.fields + LOG.info(s"field keys: ${fieldKeys.mkString(", ")}") + val nebulaKeys = edgeConfig.nebulaFields + LOG.info(s"nebula keys: ${nebulaKeys.mkString(", ")}") + val fields = if (edgeConfig.rankingField.isDefined) { + edgeConfig.rankingField.get :: edgeConfig.sourceField :: edgeConfig.targetField :: edgeConfig.fields + } else { + edgeConfig.sourceField :: edgeConfig.targetField :: edgeConfig.fields + } + val data = createDataSource(spark, edgeConfig.dataSourceConfigEntry, fields) + if (data.isDefined && !c.dry) { + val startTime = System.currentTimeMillis() + val batchSuccess = spark.sparkContext.longAccumulator(s"batchSuccess.${edgeConfig.name}") + val batchFailure = spark.sparkContext.longAccumulator(s"batchFailure.${edgeConfig.name}") + + val processor = new EdgeProcessor( + repartition(data.get, edgeConfig.partition, edgeConfig.dataSourceConfigEntry.category), + edgeConfig, + fieldKeys, + nebulaKeys, + configs, + batchSuccess, + batchFailure + ) + processor.process() + val costTime = ((System.currentTimeMillis() - startTime) / 1000.0).formatted("%.2f") + LOG.info(s"import for edge ${edgeConfig.name} cost time: ${costTime} s") + if (edgeConfig.dataSinkConfigEntry.category == SinkCategory.CLIENT) { + LOG.info(s"Client-Import: batchSuccess.${edgeConfig.name}: ${batchSuccess.value}") + LOG.info(s"Client-Import: batchFailure.${edgeConfig.name}: ${batchFailure.value}") + failures += batchFailure.value + } else { + LOG.info(s"SST-Import: failure.${edgeConfig.name}: ${batchFailure.value}") + } + } + } + } else { + LOG.warn("Edge is not defined") + } + + // reimport for failed tags and edges + if (failures > 0 && ErrorHandler.existError(configs.errorConfig.errorPath)) { + spark.sparkContext.setJobGroup("Reload", 
s"Reload: ${configs.errorConfig.errorPath}") + + val batchSuccess = spark.sparkContext.longAccumulator(s"batchSuccess.reimport") + val batchFailure = spark.sparkContext.longAccumulator(s"batchFailure.reimport") + val data = spark.read.text(configs.errorConfig.errorPath) + val startTime = System.currentTimeMillis() + val processor = new ReloadProcessor(data, configs, batchSuccess, batchFailure) + processor.process() + val costTime = ((System.currentTimeMillis() - startTime) / 1000.0).formatted("%.2f") + LOG.info(s"reimport ngql cost time: ${costTime}") + LOG.info(s"batchSuccess.reimport: ${batchSuccess.value}") + LOG.info(s"batchFailure.reimport: ${batchFailure.value}") + } + spark.close() + } + + /** + * Create data source for different data type. + * + * @param session The Spark Session. + * @param config The com.vesoft.exchange.common.config. + * @return + */ + private[this] def createDataSource( + session: SparkSession, + config: DataSourceConfigEntry, + fields: List[String] + ): Option[DataFrame] = { + config.category match { + case SourceCategory.PARQUET => + val parquetConfig = config.asInstanceOf[FileBaseSourceConfigEntry] + LOG.info(s"""Loading Parquet files from ${parquetConfig.path}""") + val reader = new ParquetReader(session, parquetConfig) + Some(reader.read()) + case SourceCategory.ORC => + val orcConfig = config.asInstanceOf[FileBaseSourceConfigEntry] + LOG.info(s"""Loading ORC files from ${orcConfig.path}""") + val reader = new ORCReader(session, orcConfig) + Some(reader.read()) + case SourceCategory.JSON => + val jsonConfig = config.asInstanceOf[FileBaseSourceConfigEntry] + LOG.info(s"""Loading JSON files from ${jsonConfig.path}""") + val reader = new JSONReader(session, jsonConfig) + Some(reader.read()) + case SourceCategory.CSV => + val csvConfig = config.asInstanceOf[FileBaseSourceConfigEntry] + LOG.info(s"""Loading CSV files from ${csvConfig.path}""") + val reader = + new CSVReader(session, csvConfig) + Some(reader.read()) + case SourceCategory.HIVE => + val hiveConfig = config.asInstanceOf[HiveSourceConfigEntry] + LOG.info(s"""Loading from Hive and exec ${hiveConfig.sentence}""") + val reader = new HiveReader(session, hiveConfig) + Some(reader.read()) + case SourceCategory.KAFKA => { + val kafkaConfig = config.asInstanceOf[KafkaSourceConfigEntry] + LOG.info(s"""Loading from Kafka ${kafkaConfig.server} and subscribe ${kafkaConfig.topic}""") + val reader = new KafkaReader(session, kafkaConfig, fields) + Some(reader.read()) + } + case SourceCategory.NEO4J => + val neo4jConfig = config.asInstanceOf[Neo4JSourceConfigEntry] + LOG.info(s"Loading from neo4j com.vesoft.exchange.common.config: ${neo4jConfig}") + val reader = new Neo4JReader(session, neo4jConfig) + Some(reader.read()) + case SourceCategory.MYSQL => + val mysqlConfig = config.asInstanceOf[MySQLSourceConfigEntry] + LOG.info(s"Loading from mysql com.vesoft.exchange.common.config: ${mysqlConfig}") + val reader = new MySQLReader(session, mysqlConfig) + Some(reader.read()) + case SourceCategory.PULSAR => + val pulsarConfig = config.asInstanceOf[PulsarSourceConfigEntry] + LOG.info(s"Loading from pulsar com.vesoft.exchange.common.config: ${pulsarConfig}") + val reader = new PulsarReader(session, pulsarConfig) + Some(reader.read()) + case SourceCategory.JANUS_GRAPH => + val janusGraphSourceConfigEntry = config.asInstanceOf[JanusGraphSourceConfigEntry] + val reader = new JanusGraphReader(session, janusGraphSourceConfigEntry) + Some(reader.read()) + case SourceCategory.HBASE => + val hbaseSourceConfigEntry = 
config.asInstanceOf[HBaseSourceConfigEntry] + val reader = new HBaseReader(session, hbaseSourceConfigEntry) + Some(reader.read()) + case SourceCategory.MAXCOMPUTE => + val maxComputeConfigEntry = config.asInstanceOf[MaxComputeConfigEntry] + val reader = new MaxcomputeReader(session, maxComputeConfigEntry) + Some(reader.read()) + case SourceCategory.CLICKHOUSE => { + val clickhouseConfigEntry = config.asInstanceOf[ClickHouseConfigEntry] + val reader = new ClickhouseReader(session, clickhouseConfigEntry) + Some(reader.read()) + } + case _ => { + LOG.error(s"Data source ${config.category} not supported") + None + } + } + } + + /** + * Repartition the data frame using the specified partition number. + * + * @param frame + * @param partition + * @return + */ + private[this] def repartition(frame: DataFrame, + partition: Int, + sourceCategory: SourceCategory.Value): DataFrame = { + if (partition > 0 && !CheckPointHandler.checkSupportResume(sourceCategory)) { + frame.repartition(partition).toDF + } else { + frame + } + } +} diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala similarity index 95% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala rename to nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala index bad1a8b7..fe12faca 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala +++ b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala @@ -5,41 +5,34 @@ package com.vesoft.nebula.exchange.processor -import java.nio.{ByteBuffer, ByteOrder} -import java.nio.file.{Files, Paths} +import java.nio.ByteOrder import com.google.common.geometry.{S2CellId, S2LatLng} -import com.vesoft.nebula.client.graph.data.HostAddress -import com.vesoft.nebula.encoder.NebulaCodecImpl -import com.vesoft.nebula.exchange.config.{ +import com.vesoft.exchange.common.{ErrorHandler, GraphProvider, MetaProvider, VidType} +import com.vesoft.exchange.common.{Edge, Edges, KeyPolicy} +import com.vesoft.exchange.common.config.{ Configs, EdgeConfigEntry, FileBaseSinkConfigEntry, SinkCategory, StreamingDataSourceConfigEntry } -import com.vesoft.nebula.exchange.utils.NebulaUtils.DEFAULT_EMPTY_VALUE -import com.vesoft.nebula.exchange.utils.{HDFSUtils, NebulaUtils} -import com.vesoft.nebula.exchange.{ - Edge, - Edges, - ErrorHandler, - GraphProvider, - KeyPolicy, - MetaProvider, - VidType -} -import org.apache.log4j.Logger -import com.vesoft.nebula.exchange.writer.{NebulaGraphClientWriter, NebulaSSTWriter} -import com.vesoft.nebula.meta.{EdgeItem, TagItem} +import com.vesoft.exchange.common.processor.Processor +import com.vesoft.exchange.common.utils.NebulaUtils +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.exchange.common.writer.{NebulaGraphClientWriter, NebulaSSTWriter} +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.encoder.NebulaCodecImpl +import com.vesoft.nebula.meta.EdgeItem import org.apache.commons.codec.digest.MurmurHash2 +import org.apache.log4j.Logger import org.apache.spark.TaskContext import org.apache.spark.sql.streaming.Trigger import org.apache.spark.sql.{DataFrame, Encoders, Row} import org.apache.spark.util.LongAccumulator import scala.collection.JavaConverters._ -import scala.collection.mutable.{ArrayBuffer, 
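
The Exchange entry point shown above repeats one loop for every tag and edge: build a DataFrame through createDataSource, repartition it unless the source supports checkpoint resume, then hand it to a processor while success/failure accumulators are tallied. A self-contained sketch of that loop, using plain Spark and hypothetical stand-ins rather than the real Exchange reader/processor classes, might look like this:

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

object ExchangeFlowSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("exchange-flow-sketch")
      .master("local[*]")
      .getOrCreate()

    // One accumulator pair per tag/edge, mirroring batchSuccess/batchFailure above.
    val batchSuccess = spark.sparkContext.longAccumulator("batchSuccess.demo")
    val batchFailure = spark.sparkContext.longAccumulator("batchFailure.demo")

    // 1. read: in the real job createDataSource() picks a Reader by SourceCategory.
    val data: DataFrame = spark.range(0, 100).toDF("id")

    // 2. repartition: skipped when the source supports checkpoint/resume.
    val supportsResume = false // assumption for the sketch
    val partition      = 4
    val prepared =
      if (partition > 0 && !supportsResume) data.repartition(partition) else data

    // 3. process: the real VerticesProcessor/EdgeProcessor writes batches to NebulaGraph
    //    and bumps the accumulators; here we only count rows.
    prepared.rdd.foreach(_ => batchSuccess.add(1))

    println(s"success=${batchSuccess.value}, failure=${batchFailure.value}")
    spark.stop()
  }
}
```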
ListBuffer} +import scala.collection.mutable.ArrayBuffer class EdgeProcessor(data: DataFrame, edgeConfig: EdgeConfigEntry, @@ -322,11 +315,15 @@ class EdgeProcessor(data: DataFrame, val srcIndex: Int = row.schema.fieldIndex(edgeConfig.sourceField) var srcId: String = row.get(srcIndex).toString - if (srcId.equals(DEFAULT_EMPTY_VALUE)) { srcId = "" } + if (srcId.equals(DEFAULT_EMPTY_VALUE)) { + srcId = "" + } val dstIndex: Int = row.schema.fieldIndex(edgeConfig.targetField) var dstId: String = row.get(dstIndex).toString - if (dstId.equals(DEFAULT_EMPTY_VALUE)) { dstId = "" } + if (dstId.equals(DEFAULT_EMPTY_VALUE)) { + dstId = "" + } if (edgeConfig.sourcePolicy.isDefined) { edgeConfig.sourcePolicy.get match { diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala similarity index 93% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala rename to nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala index 6f82b26a..1794c36a 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala +++ b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala @@ -5,31 +5,23 @@ package com.vesoft.nebula.exchange.processor -import java.nio.file.{Files, Paths} -import java.nio.{ByteBuffer, ByteOrder} +import java.nio.ByteOrder -import com.google.common.net.HostAndPort -import com.vesoft.nebula.client.graph.data.HostAddress -import com.vesoft.nebula.encoder.NebulaCodecImpl -import com.vesoft.nebula.exchange.{ - ErrorHandler, - GraphProvider, - KeyPolicy, - MetaProvider, - Vertex, - Vertices, - VidType -} -import com.vesoft.nebula.exchange.config.{ +import com.vesoft.exchange.common.{ErrorHandler, GraphProvider, MetaProvider, VidType} +import com.vesoft.exchange.common.{KeyPolicy, Vertex, Vertices} +import com.vesoft.exchange.common.config.{ Configs, FileBaseSinkConfigEntry, SinkCategory, StreamingDataSourceConfigEntry, TagConfigEntry } -import com.vesoft.nebula.exchange.utils.NebulaUtils.DEFAULT_EMPTY_VALUE -import com.vesoft.nebula.exchange.utils.{HDFSUtils, NebulaUtils} -import com.vesoft.nebula.exchange.writer.{NebulaGraphClientWriter, NebulaSSTWriter} +import com.vesoft.exchange.common.processor.Processor +import com.vesoft.exchange.common.utils.NebulaUtils +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.exchange.common.writer.{NebulaGraphClientWriter, NebulaSSTWriter} +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.encoder.NebulaCodecImpl import com.vesoft.nebula.meta.TagItem import org.apache.commons.codec.digest.MurmurHash2 import org.apache.log4j.Logger @@ -37,10 +29,9 @@ import org.apache.spark.TaskContext import org.apache.spark.sql.streaming.Trigger import org.apache.spark.sql.{DataFrame, Encoders, Row} import org.apache.spark.util.LongAccumulator -import org.rocksdb.SstFileWriter import scala.collection.JavaConverters._ -import scala.collection.mutable.{ArrayBuffer, ListBuffer} +import scala.collection.mutable.ArrayBuffer /** * diff --git a/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala new file mode 100644 index 00000000..2cd7e476 --- /dev/null +++ 
b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala @@ -0,0 +1,115 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.nebula.exchange.reader + +import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, Row, SparkSession} + +/** + * The FileBaseReader is the abstract class for HDFS file reader. + * + * @param session + * @param path + */ +abstract class FileBaseReader(val session: SparkSession, val path: String) extends Reader { + + require(path.trim.nonEmpty) + + override def close(): Unit = { + session.close() + } +} + +/** + * The ParquetReader extend the FileBaseReader and support read parquet file from HDFS. + * + * @param session + * @param parquetConfig + */ +class ParquetReader(override val session: SparkSession, parquetConfig: FileBaseSourceConfigEntry) + extends FileBaseReader(session, parquetConfig.path) { + + override def read(): DataFrame = { + session.read.parquet(path) + } +} + +/** + * The ORCReader extend the FileBaseReader and support read orc file from HDFS. + * + * @param session + * @param orcConfig + */ +class ORCReader(override val session: SparkSession, orcConfig: FileBaseSourceConfigEntry) + extends FileBaseReader(session, orcConfig.path) { + + override def read(): DataFrame = { + session.read.orc(path) + } +} + +/** + * The JSONReader extend the FileBaseReader and support read json file from HDFS. + * + * @param session + * @param jsonConfig + */ +class JSONReader(override val session: SparkSession, jsonConfig: FileBaseSourceConfigEntry) + extends FileBaseReader(session, jsonConfig.path) { + + override def read(): DataFrame = { + session.read.json(path) + } +} + +/** + * The CSVReader extend the FileBaseReader and support read csv file from HDFS. + * All types of the structure are StringType. + * + * @param session + * @param csvConfig + */ +class CSVReader(override val session: SparkSession, csvConfig: FileBaseSourceConfigEntry) + extends FileBaseReader(session, csvConfig.path) { + + override def read(): DataFrame = { + session.read + .option("delimiter", csvConfig.separator.get) + .option("header", csvConfig.header.get) + .option("emptyValue", DEFAULT_EMPTY_VALUE) + .csv(path) + } +} + +/** + * The CustomReader extend the FileBaseReader and support read text file from HDFS. + * Transformation is a function convert a line into Row. + * The structure of the row should be specified. 
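
The CSVReader above is a thin wrapper over Spark's DataFrameReader options. The equivalent standalone read is sketched below; the path and the empty-value placeholder are illustrative stand-ins, not the project's actual constants:

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

object CsvReadSketch {
  // Stand-in for NebulaUtils.DEFAULT_EMPTY_VALUE, which lives in exchange-common.
  val DefaultEmptyValue = "_NEBULA_EMPTY"

  def readCsv(session: SparkSession,
              path: String,
              separator: String,
              header: Boolean): DataFrame =
    session.read
      .option("delimiter", separator)       // csvConfig.separator in the real reader
      .option("header", header)             // csvConfig.header
      .option("emptyValue", DefaultEmptyValue)
      .csv(path)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("csv-read-sketch")
      .master("local[*]")
      .getOrCreate()
    // Hypothetical local path; the test data in this patch uses the same column layout.
    val df = readCsv(spark, "src/test/resources/vertex.csv", ",", header = true)
    df.show(3, truncate = false)
    spark.stop()
  }
}
```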
+ * + * @param session + * @param customConfig + * @param transformation + * @param structType + */ +abstract class CustomReader(override val session: SparkSession, + customConfig: FileBaseSourceConfigEntry, + transformation: String => Row, + filter: Row => Boolean, + structType: StructType) + extends FileBaseReader(session, customConfig.path) { + + override def read(): DataFrame = { + val encoder = RowEncoder.apply(structType) + session.read + .text(path) + .filter(!_.getString(0).isEmpty) + .map(row => transformation(row.getString(0)))(encoder) + .filter(filter) + } +} diff --git a/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala new file mode 100644 index 00000000..fb8455e6 --- /dev/null +++ b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala @@ -0,0 +1,63 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.nebula.exchange.reader + +import com.vesoft.exchange.common.Offset +import com.vesoft.exchange.common.utils.HDFSUtils +import org.apache.spark.sql.{DataFrame, SparkSession} + +/** + * The Reader is used to create a DataFrame from the source, such as Hive or HDFS. + */ +trait Reader extends Serializable { + def session: SparkSession + + def read(): DataFrame + + def close(): Unit +} + +trait CheckPointSupport extends Serializable { + + def getOffsets(totalCount: Long, + parallel: Int, + checkPointPath: Option[String], + checkPointNamePrefix: String): List[Offset] = { + if (totalCount <= 0) + throw new RuntimeException(s"${checkPointNamePrefix}: return data count<=0") + + val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) ::: List + .fill((parallel - totalCount % parallel).toInt)(totalCount / parallel) + + val startOffsets = batchSizes.scanLeft(0L)(_ + _).init + + val checkPointOffsets = checkPointPath match { + case Some(path) => + val files = Range(0, parallel).map(i => s"${path}/${checkPointNamePrefix}.${i}").toList + if (files.forall(HDFSUtils.exists)) + files.map(HDFSUtils.getContent(_).trim.toLong).sorted + else startOffsets + case _ => startOffsets + } + + if (checkPointOffsets.zip(startOffsets).exists(x => x._1 < x._2)) + throw new RuntimeException( + s"Check Point file maybe previous. Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") + + val eachPartitionLimit = { + batchSizes + .zip(startOffsets.zip(checkPointOffsets)) + .map(x => { + x._1 - (x._2._2 - x._2._1) + }) + } + val offsets = checkPointOffsets.zip(eachPartitionLimit).map(x => Offset(x._1, x._2)) + if (offsets.exists(_.size < 0L)) + throw new RuntimeException( + s"Check point file maybe broken. 
Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") + offsets + } +} diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala similarity index 98% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala rename to nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala index c32ae00b..b6fcd9e6 100644 --- a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala +++ b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala @@ -6,7 +6,7 @@ package com.vesoft.nebula.exchange.reader import com.google.common.collect.Maps -import com.vesoft.nebula.exchange.config.{ +import com.vesoft.exchange.common.config.{ ClickHouseConfigEntry, HBaseSourceConfigEntry, HiveSourceConfigEntry, @@ -16,7 +16,8 @@ import com.vesoft.nebula.exchange.config.{ Neo4JSourceConfigEntry, ServerDataSourceConfigEntry } -import com.vesoft.nebula.exchange.utils.{HDFSUtils, Neo4jUtils} +import com.vesoft.exchange.common.utils.HDFSUtils +import com.vesoft.nebula.exchange.utils.Neo4jUtils import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.client.Result import org.apache.hadoop.hbase.io.ImmutableBytesWritable diff --git a/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala new file mode 100644 index 00000000..a3640698 --- /dev/null +++ b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala @@ -0,0 +1,78 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
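
The densest piece of the reader layer is CheckPointSupport.getOffsets above: it splits totalCount rows into parallel near-equal batches, derives each partition's start offset as a running sum, and only then reconciles those starts with any checkpoint files on HDFS. The batching step in isolation (HDFS lookup omitted) behaves like this sketch:

```scala
object OffsetSketch {
  // Mirror of the batching arithmetic in CheckPointSupport.getOffsets, minus the checkpoint lookup.
  def split(totalCount: Long, parallel: Int): List[(Long, Long)] = {
    // The first (totalCount % parallel) partitions get one extra row.
    val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) :::
      List.fill((parallel - totalCount % parallel).toInt)(totalCount / parallel)
    // Running sum of the batch sizes gives each partition's start offset.
    val startOffsets = batchSizes.scanLeft(0L)(_ + _).init
    startOffsets.zip(batchSizes) // (start, size) per partition
  }

  def main(args: Array[String]): Unit = {
    // 10 rows over 3 partitions -> sizes 4,3,3 and starts 0,4,7
    println(split(10, 3)) // List((0,4), (4,3), (7,3))
  }
}
```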
+ */ + +package com.vesoft.nebula.exchange.reader + +import com.vesoft.exchange.common.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} +import org.apache.spark.sql.types.StringType +import org.apache.spark.sql.{DataFrame, SparkSession} + +/** + * Spark Streaming + * + * @param session + */ +abstract class StreamingBaseReader(override val session: SparkSession) extends Reader { + + override def close(): Unit = { + session.close() + } +} + +/** + * + * @param session + * @param kafkaConfig + * @param targetFields + */ +class KafkaReader(override val session: SparkSession, + kafkaConfig: KafkaSourceConfigEntry, + targetFields: List[String]) + extends StreamingBaseReader(session) { + + require( + kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) + + override def read(): DataFrame = { + import org.apache.spark.sql.functions._ + import session.implicits._ + val fields = targetFields.distinct + val reader = + session.readStream + .format("kafka") + .option("kafka.bootstrap.servers", kafkaConfig.server) + .option("subscribe", kafkaConfig.topic) + .option("startingOffsets", kafkaConfig.startingOffsets) + + val maxOffsetsPerTrigger = kafkaConfig.maxOffsetsPerTrigger + if (maxOffsetsPerTrigger.isDefined) + reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) + + reader + .load() + .select($"value".cast(StringType)) + .select(json_tuple($"value", fields: _*)) + .toDF(fields: _*) + + } +} + +/** + * + * @param session + * @param pulsarConfig + */ +class PulsarReader(override val session: SparkSession, pulsarConfig: PulsarSourceConfigEntry) + extends StreamingBaseReader(session) { + + override def read(): DataFrame = { + session.readStream + .format("pulsar") + .option("service.url", pulsarConfig.serviceUrl) + .option("admin.url", pulsarConfig.adminUrl) + .options(pulsarConfig.options) + .load() + } +} diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/Neo4jUtils.scala b/nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/utils/Neo4jUtils.scala similarity index 100% rename from nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/utils/Neo4jUtils.scala rename to nebula-exchange_spark_2.4/src/main/scala/com/vesoft/nebula/exchange/utils/Neo4jUtils.scala diff --git a/nebula-exchange_spark_2.4/src/test/resources/application.conf b/nebula-exchange_spark_2.4/src/test/resources/application.conf new file mode 100644 index 00000000..3ece57a0 --- /dev/null +++ b/nebula-exchange_spark_2.4/src/test/resources/application.conf @@ -0,0 +1,453 @@ +{ + # Spark relation com.vesoft.exchange.common.config + spark: { + app: { + name: Nebula Exchange 2.0 + } + + master:local + + driver: { + cores: 1 + maxResultSize: 1G + } + + executor: { + memory:1G + } + + cores:{ + max: 16 + } + } + + # if the hive is hive-on-spark with derby mode, you can ignore this hive configure + # get the com.vesoft.exchange.common.config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml + + hive: { + warehouse: "hdfs://NAMENODE_IP:9000/apps/svr/hive-xxx/warehouse/" + connectionURL: "jdbc:mysql://your_ip:3306/hive_spark?characterEncoding=UTF-8" + connectionDriverName: "com.mysql.jdbc.Driver" + connectionUserName: "user" + connectionPassword: "password" + } + + # Nebula Graph relation com.vesoft.exchange.common.config + nebula: { + address:{ + graph:["127.0.0.1:9669", "127.0.0.1:9670", "127.0.0.1:9671"] + meta:["127.0.0.1:9559", "127.0.0.1:9560", "127.0.0.1:9561"] + } + user: root + pswd: nebula + space: test + + # 
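
Back in the streaming readers, KafkaReader extracts the configured target fields from each message's string value with json_tuple. The same projection can be tried on a static DataFrame before wiring it to a stream; the JSON keys below are illustrative only:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.json_tuple

object JsonTupleSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("json-tuple-sketch")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // Stand-in for the Kafka `value` column: one JSON document per message.
    val messages = Seq(
      """{"kafka-field-0":"101","kafka-field-1":"Tom","kafka-field-2":"10"}""",
      """{"kafka-field-0":"102","kafka-field-1":"Jina","kafka-field-2":"11"}"""
    ).toDF("value")

    val fields = List("kafka-field-0", "kafka-field-1", "kafka-field-2")

    // Same projection the KafkaReader applies to the streaming value column.
    val parsed = messages
      .select(json_tuple($"value", fields: _*))
      .toDF(fields: _*)

    parsed.show(false)
    spark.stop()
  }
}
```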
parameters for SST import, not required + path:{ + local:"/tmp" + remote:"/sst" + hdfs.namenode: "hdfs://name_node:9000" + } + + # nebula client connection parameters + connection { + timeout: 3000 + retry: 3 + } + + # nebula client execution parameters + execution { + retry: 3 + } + + error: { + # max number of failures, if the number of failures is bigger than max, then exit the application. + max: 32 + # failed import job will be recorded in output path + output: /tmp/errors + } + + # use google's RateLimiter to limit the requests send to NebulaGraph + rate: { + # the stable throughput of RateLimiter + limit: 1024 + # Acquires a permit from RateLimiter, unit: MILLISECONDS + # if it can't be obtained within the specified timeout, then give up the request. + timeout: 1000 + } + } + + # Processing tags + # There are tag com.vesoft.exchange.common.config examples for different dataSources. + tags: [ + + # HDFS parquet + # Import mode is client, just change type.sink to sst if you want to use sst import mode. + { + name: tag0 + type: { + source: parquet + sink: client + } + path: path0 + fields: [parquet-field-0, parquet-field-1, parquet-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + vertex: { + field:parquet-field-0 + #policy:hash + } + batch: 256 + partition: 32 + } + + # HDFS csv + # Import mode is sst, just change type.sink to client if you want to use client import mode. + { + name: tag1 + type: { + source: csv + sink: sst + } + path: path1 + # if your csv file has no header, then use _c0,_c1,_c2,.. to indicate fields + fields: [csv-field-0, csv-field-1, csv-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + vertex: { + field:csv-field-0 + } + separator: "|" + header: true + batch: 256 + partition: 32 + } + + # HDFS json + { + name: tag2 + type: { + source: json + sink: client + } + path: path3 + fields: [json-field-0, json-field-1, json-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + vertex: { + field: json-field-0 + #policy: hash + } + batch: 256 + partition: 32 + } + + # Hive + { + name: tag3 + type: { + source: hive + sink: client + } + exec: "select hive-field0, hive-field1, hive-field2 from database.table" + fields: [hive-field-0, hive-field-1, hive-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + vertex: { + field: hive-field-0 + # policy: "hash" + } + batch: 256 + partition: 32 + } + + # neo4j + { + name: tag4 + type: { + source: neo4j + sink: client + } + server: "bolt://127.0.0.1:7687" + user: neo4j + password: neo4j + exec: "match (n:label) return n.neo4j-field-0 as neo4j-field-0, n.neo4j-field-1 as neo4j-field-1 order by (n.neo4j-field-0)" + fields: [neo4j-field-0, neo4j-field-1] + nebula.fields: [nebula-field-0, nebula-field-1] + vertex: { + field:neo4j-field-0 + # policy:hash + } + partition: 10 + batch: 1000 + check_point_path: /tmp/test + } + + # HBase + # if fields or vertex contains rowkey, please configure it as "rowkey". 
+ { + name: tag5 + type: { + source: hbase + sink: client + } + host:127.0.0.1 + port:2181 + table:hbase-table + columnFamily:hbase-table-cloumnfamily + fields: [hbase-column-0, hbase-column-1] + nebula.fields: [nebula-field-0, nebula-field-1] + vertex: { + field:rowkey + } + partition: 10 + batch: 1000 + } + + # Pulsar + { + name: tag6 + type: { + source: pulsar + sink: client + } + service: "pulsar://localhost:6650" + admin: "http://localhost:8081" + options: { + # choose one of "topic", "topics", "topicsPattern" + topics: "topic1,topic2" + } + fields: [pulsar-field-0, pulsar-field-1, pulsar-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + vertex: { + field:pulsar-field-0 + } + partition: 10 + batch: 1000 + interval.seconds: 10 + } + + # KAFKA + # { + # name: tag7 + # type: { + # source: kafka + # sink: client + # } + # service: "kafka.service.address" + # topic: "topic-name" + # fields: [kafka-field-0, kafka-field-1, kafka-field-2] + # nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + # vertex: { + # field: kafka-field-0 + # } + # partition: 10 + # batch: 10 + # interval.seconds: 10 + # } + + # MySql + { + name: tag8 + type: { + source: mysql + sink: client + } + user:root + host: "127.0.0.1" + port: "3306" + database: database + table:table + user:root + password:nebula + sentence: "select mysql-field0, mysql-field1, mysql-field2 from database.table" + fields: [mysql-field-0, mysql-field-1, mysql-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + vertex: { + field: mysql-field-0 + # policy: "hash" + } + batch: 256 + partition: 32 + } + ] + + # Processing edges + # There are edge com.vesoft.exchange.common.config examples for different dataSources. + edges: [ + # HDFS parquet + # Import mode is client, just change type.sink to sst if you want to use sst import mode. 
+ { + name: edge0 + type: { + source: parquet + sink: client + } + path: path0 + fields: [parquet-field-0, parquet-field-1, parquet-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + source: { + field:parquet-field-0 + #policy:hash + } + target: { + field:parquet-field-1 + #policy:hash + } + batch: 256 + partition: 32 + } + + # HDFS csv + { + name: edge1 + type: { + source: csv + sink: client + } + path: path1 + fields: [csv-field-0, csv-field-1, csv-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + source: { + field: csv-field-0 + #policy: hash + } + target: { + field: csv-field-1 + } + ranking: csv-field-2 + separator: "," + header: true + batch: 256 + partition: 32 + } + + # HDFS json + { + name: edge2 + type: { + source: json + sink: client + } + path: path2 + fields: [json-field-0, json-field-1, json-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + source: { + field: json-field-0 + #policy: hash + } + target: { + field: json-field-1 + } + ranking: json-field-2 + batch: 256 + partition: 32 + } + + # Hive + { + name: edge3 + type: { + source: hive + sink: client + } + exec: "select hive-field0, hive-field1, hive-field2 from database.table" + fields: [ hive-field-0, hive-field-1, hive-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + source: hive-field-0 + target: hive-field-1 + batch: 256 + partition: 32 + } + + # Neo4j + { + name: edge4 + type: { + source: neo4j + sink: client + } + server: "bolt://127.0.0.1:7687" + user: neo4j + password: neo4j + exec: "match (a:vertex_label)-[r:edge_label]->(b:vertex_label) return a.neo4j-source-field, b.neo4j-target-field, r.neo4j-field-0 as neo4j-field-0, r.neo4j-field-1 as neo4j-field-1 order by id(r)" + fields: [neo4j-field-0, neo4j-field-1] + nebula.fields: [nebula-field-0, nebula-field-1] + source: { + field: a.neo4j-source-field + } + target: { + field: b.neo4j-target-field + } + partition: 10 + batch: 1000 + check_point_path: /tmp/test + } + + # HBase + { + name: edge5 + type: { + source: hbase + sink: client + } + host:127.0.0.1 + port:2181 + table:hbase-table + columnFamily:hbase-table-cloumnfamily + fields: [hbase-column-0, hbase-column-1] + nebula.fields:[nebula-field-0, nebula-field-1] + source: { + field: hbase-column-k + } + target: { + field: hbase-column-h + } + partition: 10 + batch: 1000 + } + + + # Pulsar + { + name: edge6 + type: { + source: pulsar + sink: client + } + service: "pulsar://localhost:6650" + admin: "http://localhost:8081" + options: { + # choose one of "topic", "topics", "topicsPattern" + topic: "topic1" + } + fields: [pulsar-field-0, pulsar-field-1, pulsar-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + source: { + field: pulsar-field-0 + #policy: hash + } + target: { + field: pulsar-field-1 + } + ranking: pulsar-field-2 + partition: 10 + batch: 10 + interval.seconds: 10 + } + + # KAFKA + { + name: edge7 + type: { + source: kafka + sink: client + } + service: "kafka.service.address" + topic: "topic-name" + fields: [kafka-field-0, kafka-field-1, kafka-field-2] + nebula.fields: [nebula-field-0, nebula-field-1, nebula-field-2] + source: kafka-field-0 + target: kafka-field-1 + partition: 10 + batch: 1000 + interval.seconds: 10 + } + ] +} diff --git a/nebula-exchange_spark_2.4/src/test/resources/docker-compose.yaml b/nebula-exchange_spark_2.4/src/test/resources/docker-compose.yaml new file mode 100644 index 00000000..fb28aea4 --- /dev/null +++ 
b/nebula-exchange_spark_2.4/src/test/resources/docker-compose.yaml @@ -0,0 +1,353 @@ +version: '3.4' +services: + metad0: + image: vesoft/nebula-metad:nightly + environment: + USER: root + TZ: "${TZ}" + command: + - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 + - --local_ip=172.28.1.1 + - --ws_ip=172.28.1.1 + - --port=9559 + - --data_path=/data/meta + - --log_dir=/logs + - --v=0 + - --minloglevel=0 + - --heartbeat_interval_secs=2 + healthcheck: + test: ["CMD", "curl", "-f", "http://172.28.1.1:11000/status"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + ports: + - "9559:9559" + - 11000 + - 11002 + volumes: + - ./data/meta0:/data/meta:Z + - ./logs/meta0:/logs:Z + networks: + nebula-net: + ipv4_address: 172.28.1.1 + restart: on-failure + cap_add: + - SYS_PTRACE + + metad1: + image: vesoft/nebula-metad:nightly + environment: + USER: root + TZ: "${TZ}" + command: + - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 + - --local_ip=172.28.1.2 + - --ws_ip=172.28.1.2 + - --port=9559 + - --data_path=/data/meta + - --log_dir=/logs + - --v=0 + - --minloglevel=0 + - --heartbeat_interval_secs=2 + healthcheck: + test: ["CMD", "curl", "-f", "http://172.28.1.2:11000/status"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + ports: + - "9560:9559" + - 11000 + - 11002 + volumes: + - ./data/meta1:/data/meta:Z + - ./logs/meta1:/logs:Z + networks: + nebula-net: + ipv4_address: 172.28.1.2 + restart: on-failure + cap_add: + - SYS_PTRACE + + metad2: + image: vesoft/nebula-metad:nightly + environment: + USER: root + TZ: "${TZ}" + command: + - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 + - --local_ip=172.28.1.3 + - --ws_ip=172.28.1.3 + - --port=9559 + - --data_path=/data/meta + - --log_dir=/logs + - --v=0 + - --minloglevel=0 + - --heartbeat_interval_secs=2 + healthcheck: + test: ["CMD", "curl", "-f", "http://172.28.1.3:11000/status"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + ports: + - "9561:9559" + - 11000 + - 11002 + volumes: + - ./data/meta2:/data/meta:Z + - ./logs/meta2:/logs:Z + networks: + nebula-net: + ipv4_address: 172.28.1.3 + restart: on-failure + cap_add: + - SYS_PTRACE + + storaged0: + image: vesoft/nebula-storaged:nightly + environment: + USER: root + TZ: "${TZ}" + command: + - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 + - --local_ip=172.28.2.1 + - --ws_ip=172.28.2.1 + - --port=9779 + - --data_path=/data/storage + - --log_dir=/logs + - --v=0 + - --minloglevel=0 + - --heartbeat_interval_secs=2 + depends_on: + - metad0 + - metad1 + - metad2 + healthcheck: + test: ["CMD", "curl", "-f", "http://172.28.2.1:12000/status"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + ports: + - "9779:9779" + - 12000 + - 12002 + volumes: + - ./data/storage0:/data/storage:Z + - ./logs/storage0:/logs:Z + networks: + nebula-net: + ipv4_address: 172.28.2.1 + restart: on-failure + cap_add: + - SYS_PTRACE + + storaged1: + image: vesoft/nebula-storaged:nightly + environment: + USER: root + TZ: "${TZ}" + command: + - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 + - --local_ip=172.28.2.2 + - --ws_ip=172.28.2.2 + - --port=9779 + - --data_path=/data/storage + - --log_dir=/logs + - --v=0 + - --minloglevel=0 + - --heartbeat_interval_secs=2 + depends_on: + - metad0 + - metad1 + - metad2 + healthcheck: + test: ["CMD", "curl", "-f", "http://172.28.2.2:12000/status"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + ports: + - 
"9780:9779" + - 12000 + - 12002 + volumes: + - ./data/storage1:/data/storage:Z + - ./logs/storage1:/logs:Z + networks: + nebula-net: + ipv4_address: 172.28.2.2 + restart: on-failure + cap_add: + - SYS_PTRACE + + storaged2: + image: vesoft/nebula-storaged:nightly + environment: + USER: root + TZ: "${TZ}" + command: + - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 + - --local_ip=172.28.2.3 + - --ws_ip=172.28.2.3 + - --port=9779 + - --data_path=/data/storage + - --log_dir=/logs + - --v=0 + - --minloglevel=0 + - --heartbeat_interval_secs=2 + depends_on: + - metad0 + - metad1 + - metad2 + healthcheck: + test: ["CMD", "curl", "-f", "http://172.28.2.3:12000/status"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + ports: + - "9781:9779" + - 12000 + - 12002 + volumes: + - ./data/storage2:/data/storage:Z + - ./logs/storage2:/logs:Z + networks: + nebula-net: + ipv4_address: 172.28.2.3 + restart: on-failure + cap_add: + - SYS_PTRACE + + graphd0: + image: vesoft/nebula-graphd:nightly + environment: + USER: root + TZ: "${TZ}" + command: + - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 + - --port=9669 + - --ws_ip=172.28.3.1 + - --log_dir=/logs + - --v=0 + - --minloglevel=0 + - --heartbeat_interval_secs=2 + depends_on: + - metad0 + - metad1 + - metad2 + healthcheck: + test: ["CMD", "curl", "-f", "http://172.28.3.1:13000/status"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + ports: + - "9669:9669" + - 13000 + - 13002 + volumes: + - ./logs/graph0:/logs:Z + networks: + nebula-net: + ipv4_address: 172.28.3.1 + restart: on-failure + cap_add: + - SYS_PTRACE + + graphd1: + image: vesoft/nebula-graphd:nightly + environment: + USER: root + TZ: "${TZ}" + command: + - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 + - --port=9669 + - --ws_ip=172.28.3.2 + - --log_dir=/logs + - --v=0 + - --minloglevel=0 + - --heartbeat_interval_secs=2 + depends_on: + - metad0 + - metad1 + - metad2 + healthcheck: + test: ["CMD", "curl", "-f", "http://172.28.3.2:13000/status"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + ports: + - "9670:9669" + - 13000 + - 13002 + volumes: + - ./logs/graph1:/logs:Z + networks: + nebula-net: + ipv4_address: 172.28.3.2 + restart: on-failure + cap_add: + - SYS_PTRACE + + graphd2: + image: vesoft/nebula-graphd:nightly + environment: + USER: root + TZ: "${TZ}" + command: + - --meta_server_addrs=172.28.1.1:9559,172.28.1.2:9559,172.28.1.3:9559 + - --port=9669 + - --ws_ip=172.28.3.3 + - --log_dir=/logs + - --v=0 + - --minloglevel=0 + - --heartbeat_interval_secs=2 + depends_on: + - metad0 + - metad1 + - metad2 + healthcheck: + test: ["CMD", "curl", "-f", "http://172.28.3.3:13000/status"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + ports: + - "9671:9669" + - 13000 + - 13002 + volumes: + - ./logs/graph2:/logs:Z + networks: + nebula-net: + ipv4_address: 172.28.3.3 + restart: on-failure + cap_add: + - SYS_PTRACE + + console: + image: vesoft/nebula-console:nightly + entrypoint: "" + command: + - sh + - -c + - | + sleep 3 && + nebula-console -addr graphd0 -port 9669 -u root -p nebula -e 'ADD HOSTS "172.28.2.1":9779,"172.28.2.2":9779,"172.28.2.3":9779' && + sleep 36000 + depends_on: + - graphd0 + networks: + - nebula-net + +networks: + nebula-net: + ipam: + driver: default + config: + - subnet: 172.28.0.0/16 diff --git a/nebula-exchange_spark_2.4/src/test/resources/edge.csv b/nebula-exchange_spark_2.4/src/test/resources/edge.csv new file mode 100644 index 
00000000..db31f2f5 --- /dev/null +++ b/nebula-exchange_spark_2.4/src/test/resources/edge.csv @@ -0,0 +1,14 @@ +src,dst,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 +101,102,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) +102,103,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) +103,101,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) +104,106,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) +105,107,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) +106,108,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" +107,101,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" +108,109,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" +109,110,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" +110,-101,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" +-101,102,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" +-102,-103,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" +-103,-101,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" diff --git a/nebula-exchange_spark_2.4/src/test/resources/process_application.conf b/nebula-exchange_spark_2.4/src/test/resources/process_application.conf new file mode 100644 index 00000000..8160ab55 --- /dev/null +++ b/nebula-exchange_spark_2.4/src/test/resources/process_application.conf @@ -0,0 +1,126 @@ +{ + # Spark relation com.vesoft.exchange.common.config + spark: { + app: { + name: Nebula Exchange 2.0 + } + + master:local + + driver: { + cores: 1 + maxResultSize: 1G + } + + executor: { + memory:1G + } + + cores:{ + max: 16 + } + } + + # if the hive is hive-on-spark with derby mode, you can ignore this hive configure + # get the com.vesoft.exchange.common.config values from file $HIVE_HOME/conf/hive-site.xml or hive-default.xml + + hive: { + warehouse: "hdfs://NAMENODE_IP:9000/apps/svr/hive-xxx/warehouse/" + connectionURL: "jdbc:mysql://your_ip:3306/hive_spark?characterEncoding=UTF-8" + connectionDriverName: "com.mysql.jdbc.Driver" + connectionUserName: "user" + connectionPassword: "password" + } + + # Nebula Graph relation com.vesoft.exchange.common.config + nebula: { + address:{ + graph:["127.0.0.1:9669", "127.0.0.1:9670", "127.0.0.1:9671"] + meta:["127.0.0.1:9559", "127.0.0.1:9560", "127.0.0.1:9561"] + } + user: root + pswd: nebula + space: test_string + + # parameters for SST import, not required + path:{ + local:"/tmp" + remote:"/sst" + hdfs.namenode: "hdfs://name_node:9000" + } + + # nebula client connection parameters + connection { + timeout: 3000 + retry: 3 + } + + # nebula client execution parameters + execution { + retry: 3 + } + + error: { + # max number of failures, if the number of failures is bigger than max, then exit the application. 
+ max: 32 + # failed import job will be recorded in output path + output: /tmp/errors + } + + # use google's RateLimiter to limit the requests send to NebulaGraph + rate: { + # the stable throughput of RateLimiter + limit: 1024 + # Acquires a permit from RateLimiter, unit: MILLISECONDS + # if it can't be obtained within the specified timeout, then give up the request. + timeout: 1000 + } + } + + # Processing tags + # There are tag com.vesoft.exchange.common.config examples for different dataSources. + tags: [ + { + name: person + type: { + source: csv + sink: client + } + path: "file://src/test/resources/data.csv" + fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] + nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] + vertex: { + field:id + #policy:hash + } + header:true + batch: 2 + partition: 5 + } + ] + + # There are tag com.vesoft.exchange.common.config examples for different dataSources. + edges: [ + { + name: friend + type: { + source: csv + sink: client + } + path: "file://src/test/resources/data.csv" + fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] + nebula.fields: [col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14] + source: { + field:src + #policy:hash + } + target: { + field:dst + #policy:hash + } + header:true + batch: 2 + partition: 5 + } + ] +} diff --git a/nebula-exchange_spark_2.4/src/test/resources/vertex.csv b/nebula-exchange_spark_2.4/src/test/resources/vertex.csv new file mode 100644 index 00000000..846b1522 --- /dev/null +++ b/nebula-exchange_spark_2.4/src/test/resources/vertex.csv @@ -0,0 +1,14 @@ +id,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14 +101,Tom,tom,10,20,30,40,2021-01-27,2021-01-01T12:10:10,43535232,true,1.0,2.0,10:10:10,POINT(1 2) +102,Jina,Jina,11,21,31,41,2021-01-28,2021-01-02T12:10:10,43535232,false,1.1,2.1,11:10:10,POINT(3 4) +103,Tim,Tim,12,22,32,42,2021-01-29,2021-01-03T12:10:10,43535232,false,1.2,2.2,12:10:10,POINT(5 6) +104,张三,张三,13,23,33,43,2021-01-30,2021-01-04T12:10:10,43535232,true,1.3,2.3,13:10:10,POINT(6 7) +105,李四,李四,14,24,34,44,2021-02-01,2021-01-05T12:10:10,43535232,false,1.4,2.4,14:10:10,POINT(1 5) +106,王五,王五,15,25,35,45,2021-02-02,2021-01-06T12:10:10,0,false,1.5,2.5,15:10:10,"LINESTRING(1 3, 4.7 73.23)" +107,Jina,Jina,16,26,36,46,2021-02-03,2021-01-07T12:10:10,43535232,true,1.6,2.6,16:10:10,"LINESTRING(1 3, 4.7 73.23)" +108,Jina,Jina,17,27,37,47,2021-02-04,2021-01-08T12:10:10,43535232,false,1.7,2.7,17:10:10,"LINESTRING(1 3, 4.7 73.23)" +109,Jina,Jina,18,28,38,48,2021-02-05,2021-01-09T12:10:10,43535232,true,1.8,2.8,18:10:10,"LINESTRING(1 3, 4.7 73.23)" +1010,Jina,Jina,19,29,39,49,2021-02-06,2021-01-10T12:10:10,43535232,false,1.9,2.9,19:10:10,"LINESTRING(1 3, 4.7 73.23)" +-101,Jina,Jina,20,30,40,50,2021-02-07,2021-02-11T12:10:10,43535232,false,2.0,3.0,20:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" +-102,Jina,Jina,21,31,41,51,2021-02-08,2021-03-12T12:10:10,43535232,false,2.1,3.1,21:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" +-103,Jina,Jina,22,32,42,52,2021-02-09,2021-04-13T12:10:10,43535232,false,2.2,3.2,22:10:10,"POLYGON((0 1, 1 2, 2 3, 0 1))" diff --git a/nebula-exchange_spark_2.4/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala b/nebula-exchange_spark_2.4/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala new file mode 100644 index 00000000..7e6d6729 --- /dev/null +++ 
b/nebula-exchange_spark_2.4/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala @@ -0,0 +1,246 @@ +/* Copyright (c) 2021 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.exchange.common.processor + +import java.io.File + +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.PropertyType +import com.vesoft.exchange.common.KeyPolicy +import com.vesoft.exchange.common.config.{Configs, EdgeConfigEntry} +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.nebula.exchange.processor.EdgeProcessor +import com.vesoft.nebula.meta.{ColumnDef, ColumnTypeDef, EdgeItem, Schema, SchemaProp, TagItem} +import org.apache.commons.codec.binary.Hex +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.types.{ + BooleanType, + DoubleType, + IntegerType, + LongType, + ShortType, + StringType, + StructField, + StructType +} +import org.junit.Test +import org.scalatest.Assertions.assertThrows + +import scala.collection.JavaConverters._ + +class EdgeProcessorSuite { + val config: Configs = + Configs.parse(new File("../exchange-common/src/test/resources/process_application.conf")) + + var data: DataFrame = null + var edgeConfig: EdgeConfigEntry = config.edgesConfig.head + val fieldKeys = List("col1", + "col2", + "col3", + "col4", + "col5", + "col6", + "col7", + "col8", + "col9", + "col10", + "col11", + "col12", + "col13", + "col14") + val nebulaKeys = List("col1", + "col2", + "col3", + "col4", + "col5", + "col6", + "col7", + "col8", + "col9", + "col10", + "col11", + "col12", + "col13", + "col14") + + val processClazz = + new EdgeProcessor(data, edgeConfig, fieldKeys, nebulaKeys, config, null, null) + @Test + def isEdgeValidSuite(): Unit = { + val stringIdValue = List("Bob", "Tom") + val intIdValue = List("11", "12") + val schema: StructType = StructType( + List(StructField("src", StringType, nullable = true), + StructField("dst", StringType, nullable = true))) + val stringIdRow = new GenericRowWithSchema(stringIdValue.toArray, schema) + val intIdRow = new GenericRowWithSchema(intIdValue.toArray, schema) + val edgeConfigEntry = EdgeConfigEntry("friend", + null, + null, + fieldKeys, + nebulaKeys, + "src", + None, + None, + "dst", + None, + false, + None, + None, + 10, + 10, + None) + + // test for string id value without policy + assert(processClazz.isEdgeValid(stringIdRow, edgeConfigEntry, false, true)) + assert(processClazz.isEdgeValid(stringIdRow, edgeConfigEntry, true, true)) + assert(!processClazz.isEdgeValid(stringIdRow, edgeConfigEntry, true, false)) + assertThrows[AssertionError]( + processClazz.isEdgeValid(stringIdRow, edgeConfigEntry, false, false)) + + // test for int id value without policy + assert(processClazz.isEdgeValid(intIdRow, edgeConfigEntry, false, false)) + assert(processClazz.isEdgeValid(intIdRow, edgeConfigEntry, true, false)) + assert(processClazz.isEdgeValid(intIdRow, edgeConfigEntry, true, true)) + assert(processClazz.isEdgeValid(intIdRow, edgeConfigEntry, false, true)) + + // test for string id value with policy + val edgeConfigEntryWithPolicy = EdgeConfigEntry("friend", + null, + null, + fieldKeys, + nebulaKeys, + "src", + Some(KeyPolicy.HASH), + None, + "dst", + Some(KeyPolicy.HASH), + false, + None, + None, + 10, + 10, + None) + assert(!processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, true, true)) + 
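
The assertion matrix in isEdgeValidSuite above encodes how the processors validate ids against the space's vid type and an optional HASH policy. A compact restatement of that rule, simplified from the assertions (it ignores the streaming/exception distinction and is not the actual Processor implementation), is:

```scala
object VidValiditySketch {
  sealed trait Policy
  case object Hash extends Policy

  // Simplified reading of the suite: with no policy, the id must fit the space's vid type
  // (any string for STRING vids, a numeric string for INT vids); with a HASH policy the id
  // is hashed to an integer, so it only makes sense for INT-vid spaces.
  def isIdValid(id: String, policy: Option[Policy], vidIsString: Boolean): Boolean =
    policy match {
      case None       => vidIsString || id.matches("-?\\d+")
      case Some(Hash) => !vidIsString
    }

  def main(args: Array[String]): Unit = {
    println(isIdValid("Bob", None, vidIsString = true))        // true
    println(isIdValid("Bob", None, vidIsString = false))       // false: not numeric
    println(isIdValid("11",  None, vidIsString = false))       // true
    println(isIdValid("Bob", Some(Hash), vidIsString = true))  // false: hash targets INT vids
    println(isIdValid("Bob", Some(Hash), vidIsString = false)) // true
  }
}
```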
assertThrows[AssertionError]( + processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, false, true)) + + // test for int id value with policy + assert(processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, true, false)) + assert(!processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, true, true)) + assert(processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, false, false)) + assertThrows[AssertionError]( + processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, false, true)) + } + + @Test + def convertToEdgeSuite(): Unit = { + val row = getRow() + val map = getFieldType() + val edge = processClazz.convertToEdge(row, edgeConfig, true, fieldKeys, map) + assert(edge.source.equals("\"1\"")) + assert(edge.destination.equals("\"2\"")) + assert(edge.toString.equals( + "Edge: \"1\"->\"2\"@0 values: \"\", \"fixedBob\", 12, 200, 1000, 100000, date(\"2021-01-01\"), datetime(\"2021-01-01T12:00:00.100\"), time(\"12:00:00.100\"), 345436232, true, 12.01, 22.12, ST_GeogFromText(\"POINT(3 8)\")")) + } + + @Test + def encodeEdgeSuite(): Unit = { + val row = getRow() + val columns = List( + new ColumnDef("col1".getBytes(), new ColumnTypeDef(PropertyType.STRING)), + new ColumnDef("col2".getBytes(), new ColumnTypeDef(PropertyType.STRING)), + new ColumnDef("col3".getBytes(), new ColumnTypeDef(PropertyType.INT8)), + new ColumnDef("col4".getBytes(), new ColumnTypeDef(PropertyType.INT16)), + new ColumnDef("col5".getBytes(), new ColumnTypeDef(PropertyType.INT32)), + new ColumnDef("col6".getBytes(), new ColumnTypeDef(PropertyType.INT64)), + new ColumnDef("col7".getBytes(), new ColumnTypeDef(PropertyType.DATE)), + new ColumnDef("col8".getBytes(), new ColumnTypeDef(PropertyType.DATETIME)), + new ColumnDef("col9".getBytes(), new ColumnTypeDef(PropertyType.TIME)), + new ColumnDef("col10".getBytes(), new ColumnTypeDef(PropertyType.TIMESTAMP)), + new ColumnDef("col11".getBytes(), new ColumnTypeDef(PropertyType.BOOL)), + new ColumnDef("col12".getBytes(), new ColumnTypeDef(PropertyType.DOUBLE)), + new ColumnDef("col13".getBytes(), new ColumnTypeDef(PropertyType.FLOAT)), + new ColumnDef("col14".getBytes(), new ColumnTypeDef(PropertyType.GEOGRAPHY)) + ) + val schema = new Schema(columns.asJava, new SchemaProp()) + val edgeItem = new EdgeItem(2, "friend".getBytes(), -1, schema) + val map = getFieldType() + + val (key1, key2, value) = processClazz.encodeEdge(row, 10, VidType.STRING, 10, edgeItem, map) + + val keyHex1 = Hex.encodeHexString(key1) + val keyHex2 = Hex.encodeHexString(key2) + val valueHex = Hex.encodeHexString(value) + assert( + keyHex1.equals("02060000310000000000000000000200000080000000000000003200000000000000000001")) + assert( + keyHex2.equals("0201000032000000000000000000feffffff80000000000000003100000000000000000001")) + } + + private def getRow(): Row = { + val values = List( + "1", + "2", + DEFAULT_EMPTY_VALUE, + "fixedBob", + 12, + 200, + 1000, + 100000, + "2021-01-01", + "2021-01-01T12:00:00.100", + "12:00:00.100", + "345436232", + true, + 12.01, + 22.12, + "POINT(3 8)" + ) + val schema: StructType = StructType( + List( + StructField("src", StringType, nullable = false), + StructField("dst", StringType, nullable = false), + StructField("col1", StringType, nullable = true), + StructField("col2", StringType, nullable = true), + StructField("col3", ShortType, nullable = true), + StructField("col4", ShortType, nullable = true), + StructField("col5", IntegerType, nullable = true), + StructField("col6", LongType, nullable = true), + StructField("col7", 
StringType, nullable = true), + StructField("col8", StringType, nullable = true), + StructField("col9", StringType, nullable = true), + StructField("col10", StringType, nullable = true), + StructField("col11", BooleanType, nullable = true), + StructField("col12", DoubleType, nullable = true), + StructField("col13", DoubleType, nullable = true), + StructField("col14", StringType, nullable = true) + )) + val row = new GenericRowWithSchema(values.toArray, schema) + row + } + + private def getFieldType(): Map[String, Int] = { + val map = Map( + "col1" -> PropertyType.STRING.getValue, + "col2" -> PropertyType.STRING.getValue, + "col3" -> PropertyType.INT8.getValue, + "col4" -> PropertyType.INT16.getValue, + "col5" -> PropertyType.INT32.getValue, + "col6" -> PropertyType.INT64.getValue, + "col7" -> PropertyType.DATE.getValue, + "col8" -> PropertyType.DATETIME.getValue, + "col9" -> PropertyType.TIME.getValue, + "col10" -> PropertyType.TIMESTAMP.getValue, + "col11" -> PropertyType.BOOL.getValue, + "col12" -> PropertyType.DOUBLE.getValue, + "col13" -> PropertyType.FLOAT.getValue, + "col14" -> PropertyType.GEOGRAPHY.getValue + ) + map + } +} diff --git a/nebula-exchange_spark_2.4/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala b/nebula-exchange_spark_2.4/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala new file mode 100644 index 00000000..2a19cb4c --- /dev/null +++ b/nebula-exchange_spark_2.4/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala @@ -0,0 +1,208 @@ +/* Copyright (c) 2021 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.exchange.common.processor + +import java.io.File + +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.PropertyType +import com.vesoft.exchange.common.KeyPolicy +import com.vesoft.exchange.common.config.{Configs, TagConfigEntry} +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.nebula.exchange.processor.VerticesProcessor +import com.vesoft.nebula.meta.{ColumnDef, ColumnTypeDef, Schema, SchemaProp, TagItem} +import org.apache.commons.codec.binary.Hex +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.types.{ + BooleanType, + DoubleType, + IntegerType, + LongType, + ShortType, + StringType, + StructField, + StructType +} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.junit.Test +import org.scalatest.Assertions.assertThrows + +import scala.collection.JavaConverters._ + +class VerticesProcessorSuite { + val config: Configs = + Configs.parse(new File("../exchange-common/src/test/resources/process_application.conf")) + + var data: DataFrame = null + var tagConfig: TagConfigEntry = config.tagsConfig.head + val fieldKeys = List("col1", + "col2", + "col3", + "col4", + "col5", + "col6", + "col7", + "col8", + "col9", + "col10", + "col11", + "col12", + "col13", + "col14") + val nebulaKeys = List("col1", + "col2", + "col3", + "col4", + "col5", + "col6", + "col7", + "col8", + "col9", + "col10", + "col11", + "col12", + "col13", + "col14") + + val processClazz = + new VerticesProcessor(data, tagConfig, fieldKeys, nebulaKeys, config, null, null) + @Test + def isVertexValidSuite(): Unit = { + val stringIdValue = List("Bob") + val intIdValue = List("11") + val schema: StructType = StructType(List(StructField("id", StringType, nullable = true))) + val stringIdRow = new 
GenericRowWithSchema(stringIdValue.toArray, schema) + val intIdRow = new GenericRowWithSchema(intIdValue.toArray, schema) + val tagConfigEntry = TagConfigEntry("person", null, null, null, null, "id", None, 10, 10, None) + + // test for string id value without policy + assert(processClazz.isVertexValid(stringIdRow, tagConfigEntry, false, true)) + assert(processClazz.isVertexValid(stringIdRow, tagConfigEntry, true, true)) + assert(!processClazz.isVertexValid(stringIdRow, tagConfigEntry, true, false)) + assertThrows[AssertionError]( + processClazz.isVertexValid(stringIdRow, tagConfigEntry, false, false)) + + // test for int id value without policy + assert(processClazz.isVertexValid(intIdRow, tagConfigEntry, false, false)) + assert(processClazz.isVertexValid(intIdRow, tagConfigEntry, true, false)) + assert(processClazz.isVertexValid(intIdRow, tagConfigEntry, true, true)) + assert(processClazz.isVertexValid(intIdRow, tagConfigEntry, false, true)) + + // test for string id value with policy + val tagConfigEntryWithPolicy = + TagConfigEntry("person", null, null, null, null, "id", Some(KeyPolicy.HASH), 10, 10, None) + assert(!processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, true, true)) + assertThrows[AssertionError]( + processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, false, true)) + + // test for int id value with policy + assert(processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, true, false)) + assert(!processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, true, true)) + assert(processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, false, false)) + assertThrows[AssertionError]( + processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, false, true)) + } + + @Test + def convertToVertexSuite(): Unit = { + val row = getRow() + val map = getFieldType() + val vertex = processClazz.convertToVertex(row, tagConfig, true, fieldKeys, map) + assert(vertex.vertexID.equals("\"1\"")) + assert(vertex.toString.equals( + "Vertex ID: \"1\", Values: \"\", \"fixedBob\", 12, 200, 1000, 100000, date(\"2021-01-01\"), datetime(\"2021-01-01T12:00:00.100\"), time(\"12:00:00.100\"), 345436232, true, 12.01, 22.12, ST_GeogFromText(\"POINT(3 8)\")")) + } + + @Test + def encodeVertexSuite(): Unit = { + val row = getRow() + val columns = List( + new ColumnDef("col1".getBytes(), new ColumnTypeDef(PropertyType.STRING)), + new ColumnDef("col2".getBytes(), new ColumnTypeDef(PropertyType.STRING)), + new ColumnDef("col3".getBytes(), new ColumnTypeDef(PropertyType.INT8)), + new ColumnDef("col4".getBytes(), new ColumnTypeDef(PropertyType.INT16)), + new ColumnDef("col5".getBytes(), new ColumnTypeDef(PropertyType.INT32)), + new ColumnDef("col6".getBytes(), new ColumnTypeDef(PropertyType.INT64)), + new ColumnDef("col7".getBytes(), new ColumnTypeDef(PropertyType.DATE)), + new ColumnDef("col8".getBytes(), new ColumnTypeDef(PropertyType.DATETIME)), + new ColumnDef("col9".getBytes(), new ColumnTypeDef(PropertyType.TIME)), + new ColumnDef("col10".getBytes(), new ColumnTypeDef(PropertyType.TIMESTAMP)), + new ColumnDef("col11".getBytes(), new ColumnTypeDef(PropertyType.BOOL)), + new ColumnDef("col12".getBytes(), new ColumnTypeDef(PropertyType.DOUBLE)), + new ColumnDef("col13".getBytes(), new ColumnTypeDef(PropertyType.FLOAT)), + new ColumnDef("col14".getBytes(), new ColumnTypeDef(PropertyType.GEOGRAPHY)) + ) + val schema = new Schema(columns.asJava, new SchemaProp()) + val tagItem = new TagItem(1, "person".getBytes(), -1, schema) + val map = getFieldType() + + val 
(key, value) = processClazz.encodeVertex(row, 10, VidType.STRING, 10, tagItem, map) + + val keyHex = Hex.encodeHexString(key) + val valueHex = Hex.encodeHexString(value) + assert(keyHex.equals("010600003100000000000000000001000000")) + } + + private def getRow(): Row = { + val values = List( + "1", + DEFAULT_EMPTY_VALUE, + "fixedBob", + 12, + 200, + 1000, + 100000, + "2021-01-01", + "2021-01-01T12:00:00.100", + "12:00:00.100", + "345436232", + true, + 12.01, + 22.12, + "POINT(3 8)" + ) + val schema: StructType = StructType( + List( + StructField("id", StringType, nullable = false), + StructField("col1", StringType, nullable = true), + StructField("col2", StringType, nullable = true), + StructField("col3", ShortType, nullable = true), + StructField("col4", ShortType, nullable = true), + StructField("col5", IntegerType, nullable = true), + StructField("col6", LongType, nullable = true), + StructField("col7", StringType, nullable = true), + StructField("col8", StringType, nullable = true), + StructField("col9", StringType, nullable = true), + StructField("col10", StringType, nullable = true), + StructField("col11", BooleanType, nullable = true), + StructField("col12", DoubleType, nullable = true), + StructField("col13", DoubleType, nullable = true), + StructField("col14", StringType, nullable = true) + )) + val row = new GenericRowWithSchema(values.toArray, schema) + row + } + + private def getFieldType(): Map[String, Int] = { + val map = Map( + "col1" -> PropertyType.STRING.getValue, + "col2" -> PropertyType.STRING.getValue, + "col3" -> PropertyType.INT8.getValue, + "col4" -> PropertyType.INT16.getValue, + "col5" -> PropertyType.INT32.getValue, + "col6" -> PropertyType.INT64.getValue, + "col7" -> PropertyType.DATE.getValue, + "col8" -> PropertyType.DATETIME.getValue, + "col9" -> PropertyType.TIME.getValue, + "col10" -> PropertyType.TIMESTAMP.getValue, + "col11" -> PropertyType.BOOL.getValue, + "col12" -> PropertyType.DOUBLE.getValue, + "col13" -> PropertyType.FLOAT.getValue, + "col14" -> PropertyType.GEOGRAPHY.getValue + ) + map + } +} diff --git a/nebula-exchange_spark_3.0/pom.xml b/nebula-exchange_spark_3.0/pom.xml new file mode 100644 index 00000000..4a9cee9d --- /dev/null +++ b/nebula-exchange_spark_3.0/pom.xml @@ -0,0 +1,401 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>exchange</artifactId> + <groupId>com.vesoft</groupId> + <version>2.5-SNAPSHOT</version> + </parent> + <modelVersion>4.0.0</modelVersion> + + <artifactId>nebula-exchange_spark_3.0</artifactId> + + <properties> + <maven.compiler.source>1.8</maven.compiler.source> + <maven.compiler.target>1.8</maven.compiler.target> + <spark.version>3.0.0</spark.version> + <version.scala.binary>2.12</version.scala.binary> + <scala.version>2.12.10</scala.version> + <spark-csv.version>1.5.0</spark-csv.version> + <scalatest.version>3.2.0</scalatest.version> + <scala-logging.version>3.9.2</scala-logging.version> + <neo.version>4.0.1</neo.version> + <gremlin.version>3.4.6</gremlin.version> + <janusgraph.version>0.5.0</janusgraph.version> + <pulsar.version>3.1.1.1</pulsar.version> + </properties> + + <build> + <testSourceDirectory>src/test</testSourceDirectory> + <plugins> + <!-- deploy-plugin --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-deploy-plugin</artifactId> + 
<version>2.8.2</version> + <executions> + <execution> + <id>default-deploy</id> + <phase>deploy</phase> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.scala-tools</groupId> + <artifactId>maven-scala-plugin</artifactId> + <version>2.15.2</version> + <configuration> + <scalaVersion>${scala.version}</scalaVersion> + <args> + <arg>-target:jvm-1.8</arg> + </args> + <jvmArgs> + <jvmArg>-Xss4096K</jvmArg> + </jvmArgs> + </configuration> + <executions> + <execution> + <id>scala-compile</id> + <goals> + <goal>compile</goal> + </goals> + <configuration> + <excludes> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </configuration> + </execution> + <execution> + <id>scala-test-compile</id> + <goals> + <goal>testCompile</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.12.4</version> + <configuration> + <includes> + <include>**/*Test.*</include> + <include>**/*Suite.*</include> + </includes> + </configuration> + </plugin> + <plugin> + <groupId>org.scalatest</groupId> + <artifactId>scalatest-maven-plugin</artifactId> + <version>2.0.0</version> + <executions> + <execution> + <id>test</id> + <goals> + <goal>test</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>3.2.1</version> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <createDependencyReducedPom>false</createDependencyReducedPom> + <artifactSet> + <excludes> + <exclude>org.apache.spark:*</exclude> + <exclude>org.apache.hadoop:*</exclude> + <exclude>org.apache.hive:*</exclude> + <exclude>log4j:log4j</exclude> + <exclude>org.apache.orc:*</exclude> + <exclude>xml-apis:xml-apis</exclude> + <exclude>javax.inject:javax.inject</exclude> + <exclude>org.spark-project.hive:hive-exec</exclude> + <exclude>stax:stax-api</exclude> + <exclude>org.glassfish.hk2.external:aopalliance-repackaged + </exclude> + </excludes> + </artifactSet> + <filters> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </filter> + </filters> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-source-plugin</artifactId> + <version>3.2.0</version> + <executions> + <execution> + <id>attach-sources</id> + <goals> + <goal>jar</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>net.alchim31.maven</groupId> + <artifactId>scala-maven-plugin</artifactId> + <version>4.4.0</version> + <executions> + <execution> + <id>Scaladoc</id> + <goals> + <goal>doc</goal> + </goals> + <phase>prepare-package</phase> + <configuration> + <args> + <arg>-nobootcp</arg> + <arg>-no-link-warnings</arg> + </args> + </configuration> + </execution> + <execution> + <id>attach-javadocs</id> + <goals> + <goal>doc-jar</goal> + </goals> + <configuration> + <args> + <arg>-nobootcp</arg> + <arg>-no-link-warnings</arg> + </args> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-javadoc-plugin</artifactId> + <version>3.2.0</version> + <configuration> + 
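+                        <!-- skip the generated thrift packages when building the javadoc jar -->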
<excludePackageNames>com.facebook.thrift:com.facebook.thrift.* + </excludePackageNames> + </configuration> + <executions> + <execution> + <id>attach-javadocs</id> + <phase>package</phase> + <goals> + <goal>jar</goal> + </goals> + <configuration> + <encoding>UTF-8</encoding> + <charset>UTF-8</charset> + <additionalOptions> + <additionalparam>-source 8</additionalparam> + <additionalOption>-Xdoclint:none</additionalOption> + </additionalOptions> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.jacoco</groupId> + <artifactId>jacoco-maven-plugin</artifactId> + <version>0.8.7</version> + <executions> + <execution> + <goals> + <goal>prepare-agent</goal> + </goals> + </execution> + <execution> + <id>report</id> + <phase>test</phase> + <goals> + <goal>report</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + + <dependencies> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-catalyst_2.12</artifactId> + <version>${spark.version}</version> + <exclusions> + <exclusion> + <artifactId>jackson-core</artifactId> + <groupId>com.fasterxml.jackson.core</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-hive_2.12</artifactId> + <version>${spark.version}</version> + <exclusions> + <exclusion> + <artifactId>commons-codec</artifactId> + <groupId>commons-codec</groupId> + </exclusion> + <exclusion> + <artifactId>commons-logging</artifactId> + <groupId>commons-logging</groupId> + </exclusion> + <exclusion> + <artifactId>avro</artifactId> + <groupId>org.apache.avro</groupId> + </exclusion> + <exclusion> + <artifactId>commons-compress</artifactId> + <groupId>org.apache.commons</groupId> + </exclusion> + <exclusion> + <artifactId>commons-lang3</artifactId> + <groupId>org.apache.commons</groupId> + </exclusion> + <exclusion> + <artifactId>jackson-mapper-asl</artifactId> + <groupId>org.codehaus.jackson</groupId> + </exclusion> + <exclusion> + <artifactId>antlr-runtime</artifactId> + <groupId>org.antlr</groupId> + </exclusion> + <exclusion> + <artifactId>jackson-core-asl</artifactId> + <groupId>org.codehaus.jackson</groupId> + </exclusion> + <exclusion> + <artifactId>derby</artifactId> + <groupId>org.apache.derby</groupId> + </exclusion> + <exclusion> + <artifactId>httpclient</artifactId> + <groupId>org.apache.httpcomponents</groupId> + </exclusion> + <exclusion> + <artifactId>httpcore</artifactId> + <groupId>org.apache.httpcomponents</groupId> + </exclusion> + <exclusion> + <artifactId>commons-io</artifactId> + <groupId>commons-io</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-yarn_2.12</artifactId> + <version>${spark.version}</version> + <exclusions> + <exclusion> + <artifactId>guava</artifactId> + <groupId>com.google.guava</groupId> + </exclusion> + <exclusion> + <artifactId>commons-codec</artifactId> + <groupId>commons-codec</groupId> + </exclusion> + <exclusion> + <artifactId>commons-compress</artifactId> + <groupId>org.apache.commons</groupId> + </exclusion> + <exclusion> + <artifactId>activation</artifactId> + <groupId>javax.activation</groupId> + </exclusion> + <exclusion> + <artifactId>slf4j-api</artifactId> + <groupId>org.slf4j</groupId> + </exclusion> + <exclusion> + <artifactId>commons-io</artifactId> + <groupId>commons-io</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + 
<artifactId>scalatest_2.12</artifactId> + <version>${scalatest.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>com.typesafe.scala-logging</groupId> + <artifactId>scala-logging_2.12</artifactId> + <version>${scala-logging.version}</version> + <exclusions> + <exclusion> + <artifactId>scala-library</artifactId> + <groupId>org.scala-lang</groupId> + </exclusion> + <exclusion> + <artifactId>scala-reflect</artifactId> + <groupId>org.scala-lang</groupId> + </exclusion> + <exclusion> + <artifactId>slf4j-api</artifactId> + <groupId>org.slf4j</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>com.thoughtworks.paranamer</groupId> + <artifactId>paranamer</artifactId> + <version>2.8</version> + </dependency> + + <!-- resolve the lib unstisfiedlinkerror for snappy 1.0.4 --> + <dependency> + <groupId>org.xerial.snappy</groupId> + <artifactId>snappy-java</artifactId> + <version>1.0.5</version> + </dependency> + <dependency> + <groupId>org.apache.spark</groupId> + <artifactId>spark-sql-kafka-0-10_2.12</artifactId> + <version>3.0.0</version> + </dependency> + + <dependency> + <groupId>io.streamnative.connectors</groupId> + <artifactId>pulsar-spark-connector_2.12</artifactId> + <version>${pulsar.version}</version> + </dependency> + + <dependency> + <groupId>com.vesoft</groupId> + <artifactId>exchange-common</artifactId> + <version>${project.version}</version> + </dependency> + </dependencies> + <repositories> + <repository> + <id>SparkPackagesRepo</id> + <url>https://repos.spark-packages.org</url> + </repository> + <repository> + <id>snapshots</id> + <url>https://oss.sonatype.org/content/repositories/snapshots/</url> + </repository> + </repositories> + +</project> diff --git a/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala new file mode 100644 index 00000000..3c8a4653 --- /dev/null +++ b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/Exchange.scala @@ -0,0 +1,330 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.nebula.exchange + +import org.apache.spark.sql.{DataFrame, SparkSession} +import java.io.File + +import com.vesoft.exchange.Argument +import com.vesoft.exchange.common.{CheckPointHandler, ErrorHandler} +import com.vesoft.exchange.common.config.{ + ClickHouseConfigEntry, + Configs, + DataSourceConfigEntry, + FileBaseSourceConfigEntry, + HBaseSourceConfigEntry, + HiveSourceConfigEntry, + JanusGraphSourceConfigEntry, + KafkaSourceConfigEntry, + MaxComputeConfigEntry, + MySQLSourceConfigEntry, + Neo4JSourceConfigEntry, + PulsarSourceConfigEntry, + SinkCategory, + SourceCategory +} +import com.vesoft.nebula.exchange.reader.{ + CSVReader, + ClickhouseReader, + HBaseReader, + HiveReader, + JSONReader, + JanusGraphReader, + KafkaReader, + MaxcomputeReader, + MySQLReader, + Neo4JReader, + ORCReader, + ParquetReader, + PulsarReader +} +import com.vesoft.exchange.common.processor.ReloadProcessor +import com.vesoft.nebula.exchange.processor.{EdgeProcessor, VerticesProcessor} +import org.apache.log4j.Logger +import org.apache.spark.SparkConf + +final case class TooManyErrorsException(private val message: String) extends Exception(message) + +/** + * SparkClientGenerator is a simple spark job used to write data into Nebula Graph parallel. 
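+ *
+ * A typical submission looks roughly like the following; the jar name and config path are
+ * placeholders, and the full option list (hive mode, dry run, reload) is defined by Configs.parser:
+ * {{{
+ * spark-submit --master yarn \
+ *   --class com.vesoft.nebula.exchange.Exchange \
+ *   nebula-exchange_spark_3.0.jar -c application.conf
+ * }}}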
+ */ +object Exchange { + private[this] val LOG = Logger.getLogger(this.getClass) + + def main(args: Array[String]): Unit = { + val PROGRAM_NAME = "Nebula Graph Exchange" + val options = Configs.parser(args, PROGRAM_NAME) + val c: Argument = options match { + case Some(config) => config + case _ => + LOG.error("Argument parse failed") + sys.exit(-1) + } + + val configs = Configs.parse(new File(c.config)) + LOG.info(s"Config ${configs}") + + val session = SparkSession + .builder() + .appName(PROGRAM_NAME) + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + + for (key <- configs.sparkConfigEntry.map.keySet) { + session.config(key, configs.sparkConfigEntry.map(key)) + } + + val sparkConf = new SparkConf() + sparkConf.registerKryoClasses(Array(classOf[com.facebook.thrift.async.TAsyncClientManager])) + + // com.vesoft.exchange.common.config hive for sparkSession + if (c.hive) { + if (configs.hiveConfigEntry.isEmpty) { + LOG.info( + "you don't com.vesoft.exchange.common.config hive source, so using hive tied with spark.") + } else { + val hiveConfig = configs.hiveConfigEntry.get + sparkConf.set("spark.sql.warehouse.dir", hiveConfig.warehouse) + sparkConf + .set("javax.jdo.option.ConnectionURL", hiveConfig.connectionURL) + .set("javax.jdo.option.ConnectionDriverName", hiveConfig.connectionDriverName) + .set("javax.jdo.option.ConnectionUserName", hiveConfig.connectionUserName) + .set("javax.jdo.option.ConnectionPassword", hiveConfig.connectionPassWord) + } + } + + session.config(sparkConf) + + if (c.hive) { + session.enableHiveSupport() + } + + val spark = session.getOrCreate() + + // reload for failed import tasks + if (!c.reload.isEmpty) { + val batchSuccess = spark.sparkContext.longAccumulator(s"batchSuccess.reload") + val batchFailure = spark.sparkContext.longAccumulator(s"batchFailure.reload") + + val data = spark.read.text(c.reload) + val processor = new ReloadProcessor(data, configs, batchSuccess, batchFailure) + processor.process() + LOG.info(s"batchSuccess.reload: ${batchSuccess.value}") + LOG.info(s"batchFailure.reload: ${batchFailure.value}") + sys.exit(0) + } + + // record the failed batch number + var failures: Long = 0L + + // import tags + if (configs.tagsConfig.nonEmpty) { + for (tagConfig <- configs.tagsConfig) { + LOG.info(s"Processing Tag ${tagConfig.name}") + spark.sparkContext.setJobGroup(tagConfig.name, s"Tag: ${tagConfig.name}") + + val fieldKeys = tagConfig.fields + LOG.info(s"field keys: ${fieldKeys.mkString(", ")}") + val nebulaKeys = tagConfig.nebulaFields + LOG.info(s"nebula keys: ${nebulaKeys.mkString(", ")}") + + val fields = tagConfig.vertexField :: tagConfig.fields + val data = createDataSource(spark, tagConfig.dataSourceConfigEntry, fields) + if (data.isDefined && !c.dry) { + val startTime = System.currentTimeMillis() + val batchSuccess = + spark.sparkContext.longAccumulator(s"batchSuccess.${tagConfig.name}") + val batchFailure = + spark.sparkContext.longAccumulator(s"batchFailure.${tagConfig.name}") + + val processor = new VerticesProcessor( + repartition(data.get, tagConfig.partition, tagConfig.dataSourceConfigEntry.category), + tagConfig, + fieldKeys, + nebulaKeys, + configs, + batchSuccess, + batchFailure) + processor.process() + val costTime = ((System.currentTimeMillis() - startTime) / 1000.0).formatted("%.2f") + LOG.info(s"import for tag ${tagConfig.name} cost time: ${costTime} s") + if (tagConfig.dataSinkConfigEntry.category == SinkCategory.CLIENT) { + LOG.info(s"Client-Import: batchSuccess.${tagConfig.name}: 
${batchSuccess.value}") + LOG.info(s"Client-Import: batchFailure.${tagConfig.name}: ${batchFailure.value}") + failures += batchFailure.value + } else { + LOG.info(s"SST-Import: failure.${tagConfig.name}: ${batchFailure.value}") + } + } + } + } else { + LOG.warn("Tag is not defined") + } + + // import edges + if (configs.edgesConfig.nonEmpty) { + for (edgeConfig <- configs.edgesConfig) { + LOG.info(s"Processing Edge ${edgeConfig.name}") + spark.sparkContext.setJobGroup(edgeConfig.name, s"Edge: ${edgeConfig.name}") + + val fieldKeys = edgeConfig.fields + LOG.info(s"field keys: ${fieldKeys.mkString(", ")}") + val nebulaKeys = edgeConfig.nebulaFields + LOG.info(s"nebula keys: ${nebulaKeys.mkString(", ")}") + val fields = if (edgeConfig.rankingField.isDefined) { + edgeConfig.rankingField.get :: edgeConfig.sourceField :: edgeConfig.targetField :: edgeConfig.fields + } else { + edgeConfig.sourceField :: edgeConfig.targetField :: edgeConfig.fields + } + val data = createDataSource(spark, edgeConfig.dataSourceConfigEntry, fields) + if (data.isDefined && !c.dry) { + val startTime = System.currentTimeMillis() + val batchSuccess = spark.sparkContext.longAccumulator(s"batchSuccess.${edgeConfig.name}") + val batchFailure = spark.sparkContext.longAccumulator(s"batchFailure.${edgeConfig.name}") + + val processor = new EdgeProcessor( + repartition(data.get, edgeConfig.partition, edgeConfig.dataSourceConfigEntry.category), + edgeConfig, + fieldKeys, + nebulaKeys, + configs, + batchSuccess, + batchFailure + ) + processor.process() + val costTime = ((System.currentTimeMillis() - startTime) / 1000.0).formatted("%.2f") + LOG.info(s"import for edge ${edgeConfig.name} cost time: ${costTime} s") + if (edgeConfig.dataSinkConfigEntry.category == SinkCategory.CLIENT) { + LOG.info(s"Client-Import: batchSuccess.${edgeConfig.name}: ${batchSuccess.value}") + LOG.info(s"Client-Import: batchFailure.${edgeConfig.name}: ${batchFailure.value}") + failures += batchFailure.value + } else { + LOG.info(s"SST-Import: failure.${edgeConfig.name}: ${batchFailure.value}") + } + } + } + } else { + LOG.warn("Edge is not defined") + } + + // reimport for failed tags and edges + if (failures > 0 && ErrorHandler.existError(configs.errorConfig.errorPath)) { + spark.sparkContext.setJobGroup("Reload", s"Reload: ${configs.errorConfig.errorPath}") + + val batchSuccess = spark.sparkContext.longAccumulator(s"batchSuccess.reimport") + val batchFailure = spark.sparkContext.longAccumulator(s"batchFailure.reimport") + val data = spark.read.text(configs.errorConfig.errorPath) + val startTime = System.currentTimeMillis() + val processor = new ReloadProcessor(data, configs, batchSuccess, batchFailure) + processor.process() + val costTime = ((System.currentTimeMillis() - startTime) / 1000.0).formatted("%.2f") + LOG.info(s"reimport ngql cost time: ${costTime}") + LOG.info(s"batchSuccess.reimport: ${batchSuccess.value}") + LOG.info(s"batchFailure.reimport: ${batchFailure.value}") + } + spark.close() + } + + /** + * Create data source for different data type. + * + * @param session The Spark Session. + * @param config The com.vesoft.exchange.common.config. 
+ * @return + */ + private[this] def createDataSource( + session: SparkSession, + config: DataSourceConfigEntry, + fields: List[String] + ): Option[DataFrame] = { + config.category match { + case SourceCategory.PARQUET => + val parquetConfig = config.asInstanceOf[FileBaseSourceConfigEntry] + LOG.info(s"""Loading Parquet files from ${parquetConfig.path}""") + val reader = new ParquetReader(session, parquetConfig) + Some(reader.read()) + case SourceCategory.ORC => + val orcConfig = config.asInstanceOf[FileBaseSourceConfigEntry] + LOG.info(s"""Loading ORC files from ${orcConfig.path}""") + val reader = new ORCReader(session, orcConfig) + Some(reader.read()) + case SourceCategory.JSON => + val jsonConfig = config.asInstanceOf[FileBaseSourceConfigEntry] + LOG.info(s"""Loading JSON files from ${jsonConfig.path}""") + val reader = new JSONReader(session, jsonConfig) + Some(reader.read()) + case SourceCategory.CSV => + val csvConfig = config.asInstanceOf[FileBaseSourceConfigEntry] + LOG.info(s"""Loading CSV files from ${csvConfig.path}""") + val reader = + new CSVReader(session, csvConfig) + Some(reader.read()) + case SourceCategory.HIVE => + val hiveConfig = config.asInstanceOf[HiveSourceConfigEntry] + LOG.info(s"""Loading from Hive and exec ${hiveConfig.sentence}""") + val reader = new HiveReader(session, hiveConfig) + Some(reader.read()) + case SourceCategory.KAFKA => { + val kafkaConfig = config.asInstanceOf[KafkaSourceConfigEntry] + LOG.info(s"""Loading from Kafka ${kafkaConfig.server} and subscribe ${kafkaConfig.topic}""") + val reader = new KafkaReader(session, kafkaConfig, fields) + Some(reader.read()) + } + case SourceCategory.NEO4J => + val neo4jConfig = config.asInstanceOf[Neo4JSourceConfigEntry] + LOG.info(s"Loading from neo4j com.vesoft.exchange.common.config: ${neo4jConfig}") + val reader = new Neo4JReader(session, neo4jConfig) + Some(reader.read()) + case SourceCategory.MYSQL => + val mysqlConfig = config.asInstanceOf[MySQLSourceConfigEntry] + LOG.info(s"Loading from mysql com.vesoft.exchange.common.config: ${mysqlConfig}") + val reader = new MySQLReader(session, mysqlConfig) + Some(reader.read()) + case SourceCategory.PULSAR => + val pulsarConfig = config.asInstanceOf[PulsarSourceConfigEntry] + LOG.info(s"Loading from pulsar com.vesoft.exchange.common.config: ${pulsarConfig}") + val reader = new PulsarReader(session, pulsarConfig) + Some(reader.read()) + case SourceCategory.JANUS_GRAPH => + val janusGraphSourceConfigEntry = config.asInstanceOf[JanusGraphSourceConfigEntry] + val reader = new JanusGraphReader(session, janusGraphSourceConfigEntry) + Some(reader.read()) + case SourceCategory.HBASE => + val hbaseSourceConfigEntry = config.asInstanceOf[HBaseSourceConfigEntry] + val reader = new HBaseReader(session, hbaseSourceConfigEntry) + Some(reader.read()) + case SourceCategory.MAXCOMPUTE => + val maxComputeConfigEntry = config.asInstanceOf[MaxComputeConfigEntry] + val reader = new MaxcomputeReader(session, maxComputeConfigEntry) + Some(reader.read()) + case SourceCategory.CLICKHOUSE => { + val clickhouseConfigEntry = config.asInstanceOf[ClickHouseConfigEntry] + val reader = new ClickhouseReader(session, clickhouseConfigEntry) + Some(reader.read()) + } + case _ => { + LOG.error(s"Data source ${config.category} not supported") + None + } + } + } + + /** + * Repartition the data frame using the specified partition number. 
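+ * A source that supports checkpoint resume (see CheckPointHandler.checkSupportResume) keeps its
+ * original partitioning and is not repartitioned.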
+ * + * @param frame + * @param partition + * @return + */ + private[this] def repartition(frame: DataFrame, + partition: Int, + sourceCategory: SourceCategory.Value): DataFrame = { + if (partition > 0 && !CheckPointHandler.checkSupportResume(sourceCategory)) { + frame.repartition(partition).toDF + } else { + frame + } + } +} diff --git a/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala new file mode 100644 index 00000000..bfa5dbae --- /dev/null +++ b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala @@ -0,0 +1,406 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.nebula.exchange.processor + +import java.nio.ByteOrder + +import com.google.common.geometry.{S2CellId, S2LatLng} +import com.vesoft.exchange.common.{ErrorHandler, GraphProvider, MetaProvider, VidType} +import com.vesoft.exchange.common.{Edge, Edges, KeyPolicy} +import com.vesoft.exchange.common.config.{ + Configs, + EdgeConfigEntry, + FileBaseSinkConfigEntry, + SinkCategory, + StreamingDataSourceConfigEntry +} +import com.vesoft.exchange.common.processor.Processor +import com.vesoft.exchange.common.utils.NebulaUtils +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.exchange.common.writer.{NebulaGraphClientWriter, NebulaSSTWriter} +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.encoder.NebulaCodecImpl +import com.vesoft.nebula.meta.EdgeItem +import org.apache.commons.codec.digest.MurmurHash2 +import org.apache.log4j.Logger +import org.apache.spark.TaskContext +import org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row} +import org.apache.spark.util.LongAccumulator + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +class EdgeProcessor(data: DataFrame, + edgeConfig: EdgeConfigEntry, + fieldKeys: List[String], + nebulaKeys: List[String], + config: Configs, + batchSuccess: LongAccumulator, + batchFailure: LongAccumulator) + extends Processor { + + @transient + private[this] lazy val LOG = Logger.getLogger(this.getClass) + + private[this] val DEFAULT_MIN_CELL_LEVEL = 10 + private[this] val DEFAULT_MAX_CELL_LEVEL = 18 + + private def processEachPartition(iterator: Iterator[Edge]): Unit = { + val graphProvider = + new GraphProvider(config.databaseConfig.getGraphAddress, + config.connectionConfig.timeout, + config.sslConfig) + val writer = new NebulaGraphClientWriter(config.databaseConfig, + config.userConfig, + config.rateConfig, + edgeConfig, + graphProvider) + val errorBuffer = ArrayBuffer[String]() + + writer.prepare() + // batch write tags + val startTime = System.currentTimeMillis + iterator.grouped(edgeConfig.batch).foreach { edge => + val edges = Edges(nebulaKeys, edge.toList, edgeConfig.sourcePolicy, edgeConfig.targetPolicy) + val failStatement = writer.writeEdges(edges) + if (failStatement == null) { + batchSuccess.add(1) + } else { + errorBuffer.append(failStatement) + batchFailure.add(1) + } + } + if (errorBuffer.nonEmpty) { + ErrorHandler.save( + errorBuffer, + s"${config.errorConfig.errorPath}/${edgeConfig.name}.${TaskContext.getPartitionId}") + errorBuffer.clear() + } + LOG.info(s"edge ${edgeConfig.name} import in spark partition ${TaskContext + .getPartitionId()} cost 
${System.currentTimeMillis() - startTime}ms") + writer.close() + graphProvider.close() + } + + override def process(): Unit = { + + val address = config.databaseConfig.getMetaAddress + val space = config.databaseConfig.space + + val timeout = config.connectionConfig.timeout + val retry = config.connectionConfig.retry + val metaProvider = new MetaProvider(address, timeout, retry, config.sslConfig) + val fieldTypeMap = NebulaUtils.getDataSourceFieldType(edgeConfig, space, metaProvider) + val isVidStringType = metaProvider.getVidType(space) == VidType.STRING + val partitionNum = metaProvider.getPartNumber(space) + + if (edgeConfig.dataSinkConfigEntry.category == SinkCategory.SST) { + val fileBaseConfig = edgeConfig.dataSinkConfigEntry.asInstanceOf[FileBaseSinkConfigEntry] + val namenode = fileBaseConfig.fsName.orNull + val edgeName = edgeConfig.name + + val vidType = metaProvider.getVidType(space) + val spaceVidLen = metaProvider.getSpaceVidLen(space) + val edgeItem = metaProvider.getEdgeItem(space, edgeName) + + val distintData = if (edgeConfig.rankingField.isDefined) { + data.dropDuplicates(edgeConfig.sourceField, + edgeConfig.targetField, + edgeConfig.rankingField.get) + } else { + data.dropDuplicates(edgeConfig.sourceField, edgeConfig.targetField) + } + distintData + .mapPartitions { iter => + iter.map { row => + encodeEdge(row, partitionNum, vidType, spaceVidLen, edgeItem, fieldTypeMap) + } + }(Encoders.tuple(Encoders.BINARY, Encoders.BINARY, Encoders.BINARY)) + .flatMap(line => { + List((line._1, line._3), (line._2, line._3)) + })(Encoders.tuple(Encoders.BINARY, Encoders.BINARY)) + .toDF("key", "value") + .sortWithinPartitions("key") + .foreachPartition { iterator: Iterator[Row] => + val sstFileWriter = new NebulaSSTWriter + sstFileWriter.writeSstFiles(iterator, + fileBaseConfig, + partitionNum, + namenode, + batchFailure) + } + } else { + val streamFlag = data.isStreaming + val edgeFrame = data + .filter { row => + isEdgeValid(row, edgeConfig, streamFlag, isVidStringType) + } + .map { row => + convertToEdge(row, edgeConfig, isVidStringType, fieldKeys, fieldTypeMap) + }(Encoders.kryo[Edge]) + + // streaming write + if (streamFlag) { + val streamingDataSourceConfig = + edgeConfig.dataSourceConfigEntry.asInstanceOf[StreamingDataSourceConfigEntry] + val wStream = edgeFrame.writeStream + if (edgeConfig.checkPointPath.isDefined) + wStream.option("checkpointLocation", edgeConfig.checkPointPath.get) + + wStream + .foreachBatch((edges: Dataset[Edge], batchId: Long) => { + LOG.info(s"${edgeConfig.name} edge start batch ${batchId}.") + edges.foreachPartition(processEachPartition _) + }) + .trigger(Trigger.ProcessingTime(s"${streamingDataSourceConfig.intervalSeconds} seconds")) + .start() + .awaitTermination() + } else + edgeFrame.foreachPartition(processEachPartition _) + } + } + + private[this] def indexCells(lat: Double, lng: Double): IndexedSeq[Long] = { + val coordinate = S2LatLng.fromDegrees(lat, lng) + val s2CellId = S2CellId.fromLatLng(coordinate) + for (index <- DEFAULT_MIN_CELL_LEVEL to DEFAULT_MAX_CELL_LEVEL) + yield s2CellId.parent(index).id() + } + + /** + * filter and check row data for edge, if streaming only print log + */ + def isEdgeValid(row: Row, + edgeConfig: EdgeConfigEntry, + streamFlag: Boolean, + isVidStringType: Boolean): Boolean = { + val sourceFlag = checkField(edgeConfig.sourceField, + "source_field", + row, + edgeConfig.sourcePolicy, + streamFlag, + isVidStringType) + + val targetFlag = checkField(edgeConfig.targetField, + "target_field", + row, + 
edgeConfig.targetPolicy, + streamFlag, + isVidStringType) + + val edgeRankFlag = if (edgeConfig.rankingField.isDefined) { + val index = row.schema.fieldIndex(edgeConfig.rankingField.get) + if (index < 0 || row.isNullAt(index)) { + printChoice(streamFlag, s"rank must exist and cannot be null, your row data is $row") + } + val ranking = row.get(index).toString + if (!NebulaUtils.isNumic(ranking)) { + printChoice(streamFlag, + s"Not support non-Numeric type for ranking field.your row data is $row") + false + } else true + } else true + sourceFlag && targetFlag && edgeRankFlag + } + + /** + * check if edge source id and target id valid + */ + def checkField(field: String, + fieldType: String, + row: Row, + policy: Option[KeyPolicy.Value], + streamFlag: Boolean, + isVidStringType: Boolean): Boolean = { + val fieldValue = if (edgeConfig.isGeo && "source_field".equals(fieldType)) { + val lat = row.getDouble(row.schema.fieldIndex(edgeConfig.latitude.get)) + val lng = row.getDouble(row.schema.fieldIndex(edgeConfig.longitude.get)) + Some(indexCells(lat, lng).mkString(",")) + } else { + val index = row.schema.fieldIndex(field) + if (index < 0 || row.isNullAt(index)) { + printChoice(streamFlag, s"$fieldType must exist and cannot be null, your row data is $row") + None + } else Some(row.get(index).toString) + } + + val idFlag = fieldValue.isDefined + val policyFlag = + if (idFlag && policy.isEmpty && !isVidStringType + && !NebulaUtils.isNumic(fieldValue.get)) { + printChoice( + streamFlag, + s"space vidType is int, but your $fieldType $fieldValue is not numeric.your row data is $row") + false + } else if (idFlag && policy.isDefined && isVidStringType) { + printChoice( + streamFlag, + s"only int vidType can use policy, but your vidType is FIXED_STRING.your row data is $row") + false + } else true + idFlag && policyFlag + } + + /** + * convert row data to {@link Edge} + */ + def convertToEdge(row: Row, + edgeConfig: EdgeConfigEntry, + isVidStringType: Boolean, + fieldKeys: List[String], + fieldTypeMap: Map[String, Int]): Edge = { + val sourceField = processField(edgeConfig.sourceField, + "source_field", + row, + edgeConfig.sourcePolicy, + isVidStringType) + + val targetField = processField(edgeConfig.targetField, + "target_field", + row, + edgeConfig.targetPolicy, + isVidStringType) + + val values = for { + property <- fieldKeys if property.trim.length != 0 + } yield extraValueForClient(row, property, fieldTypeMap) + + if (edgeConfig.rankingField.isDefined) { + val index = row.schema.fieldIndex(edgeConfig.rankingField.get) + val ranking = row.get(index).toString + Edge(sourceField, targetField, Some(ranking.toLong), values) + } else { + Edge(sourceField, targetField, None, values) + } + } + + /** + * process edge source and target field + */ + def processField(field: String, + fieldType: String, + row: Row, + policy: Option[KeyPolicy.Value], + isVidStringType: Boolean): String = { + var fieldValue = if (edgeConfig.isGeo && "source_field".equals(fieldType)) { + val lat = row.getDouble(row.schema.fieldIndex(edgeConfig.latitude.get)) + val lng = row.getDouble(row.schema.fieldIndex(edgeConfig.longitude.get)) + indexCells(lat, lng).mkString(",") + } else { + val index = row.schema.fieldIndex(field) + val value = row.get(index).toString + if (value.equals(DEFAULT_EMPTY_VALUE)) "" else value + } + // process string type vid + if (policy.isEmpty && isVidStringType) { + fieldValue = NebulaUtils.escapeUtil(fieldValue).mkString("\"", "", "\"") + } + fieldValue + } + + /** + * encode edge + */ + def encodeEdge(row: 
Row, + partitionNum: Int, + vidType: VidType.Value, + spaceVidLen: Int, + edgeItem: EdgeItem, + fieldTypeMap: Map[String, Int]): (Array[Byte], Array[Byte], Array[Byte]) = { + isEdgeValid(row, edgeConfig, false, vidType == VidType.STRING) + + val srcIndex: Int = row.schema.fieldIndex(edgeConfig.sourceField) + var srcId: String = row.get(srcIndex).toString + if (srcId.equals(DEFAULT_EMPTY_VALUE)) { + srcId = "" + } + + val dstIndex: Int = row.schema.fieldIndex(edgeConfig.targetField) + var dstId: String = row.get(dstIndex).toString + if (dstId.equals(DEFAULT_EMPTY_VALUE)) { + dstId = "" + } + + if (edgeConfig.sourcePolicy.isDefined) { + edgeConfig.sourcePolicy.get match { + case KeyPolicy.HASH => + srcId = MurmurHash2 + .hash64(srcId.getBytes(), srcId.getBytes().length, 0xc70f6907) + .toString + case KeyPolicy.UUID => + throw new UnsupportedOperationException("do not support uuid yet") + case _ => + throw new IllegalArgumentException(s"policy ${edgeConfig.sourcePolicy.get} is invalidate") + } + } + if (edgeConfig.targetPolicy.isDefined) { + edgeConfig.targetPolicy.get match { + case KeyPolicy.HASH => + dstId = MurmurHash2 + .hash64(dstId.getBytes(), dstId.getBytes().length, 0xc70f6907) + .toString + case KeyPolicy.UUID => + throw new UnsupportedOperationException("do not support uuid yet") + case _ => + throw new IllegalArgumentException(s"policy ${edgeConfig.targetPolicy.get} is invalidate") + } + } + + val ranking: Long = if (edgeConfig.rankingField.isDefined) { + val rankIndex = row.schema.fieldIndex(edgeConfig.rankingField.get) + row.get(rankIndex).toString.toLong + } else { + 0 + } + + val srcPartitionId = NebulaUtils.getPartitionId(srcId, partitionNum, vidType) + val dstPartitionId = NebulaUtils.getPartitionId(dstId, partitionNum, vidType) + val codec = new NebulaCodecImpl() + + import java.nio.ByteBuffer + val srcBytes = if (vidType == VidType.INT) { + ByteBuffer + .allocate(8) + .order(ByteOrder.nativeOrder) + .putLong(srcId.toLong) + .array + } else { + srcId.getBytes() + } + + val dstBytes = if (vidType == VidType.INT) { + ByteBuffer + .allocate(8) + .order(ByteOrder.nativeOrder) + .putLong(dstId.toLong) + .array + } else { + dstId.getBytes() + } + val positiveEdgeKey = codec.edgeKeyByDefaultVer(spaceVidLen, + srcPartitionId, + srcBytes, + edgeItem.getEdge_type, + ranking, + dstBytes) + val reverseEdgeKey = codec.edgeKeyByDefaultVer(spaceVidLen, + dstPartitionId, + dstBytes, + -edgeItem.getEdge_type, + ranking, + srcBytes) + + val values = for { + property <- fieldKeys if property.trim.length != 0 + } yield + extraValueForSST(row, property, fieldTypeMap) + .asInstanceOf[AnyRef] + + val edgeValue = codec.encodeEdge(edgeItem, nebulaKeys.asJava, values.asJava) + (positiveEdgeKey, reverseEdgeKey, edgeValue) + } +} diff --git a/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala new file mode 100644 index 00000000..76dc1575 --- /dev/null +++ b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala @@ -0,0 +1,274 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
+ */ + +package com.vesoft.nebula.exchange.processor + +import java.nio.ByteOrder + +import com.vesoft.exchange.common.{ErrorHandler, GraphProvider, MetaProvider, VidType} +import com.vesoft.exchange.common.{KeyPolicy, Vertex, Vertices} +import com.vesoft.exchange.common.config.{ + Configs, + FileBaseSinkConfigEntry, + SinkCategory, + StreamingDataSourceConfigEntry, + TagConfigEntry +} +import com.vesoft.exchange.common.processor.Processor +import com.vesoft.exchange.common.utils.NebulaUtils +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.exchange.common.writer.{NebulaGraphClientWriter, NebulaSSTWriter} +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.encoder.NebulaCodecImpl +import com.vesoft.nebula.meta.TagItem +import org.apache.commons.codec.digest.MurmurHash2 +import org.apache.log4j.Logger +import org.apache.spark.TaskContext +import org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row} +import org.apache.spark.util.LongAccumulator + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +/** + * + * @param data + * @param tagConfig + * @param fieldKeys + * @param nebulaKeys + * @param config + * @param batchSuccess + * @param batchFailure + */ +class VerticesProcessor(data: DataFrame, + tagConfig: TagConfigEntry, + fieldKeys: List[String], + nebulaKeys: List[String], + config: Configs, + batchSuccess: LongAccumulator, + batchFailure: LongAccumulator) + extends Processor { + + @transient + private[this] lazy val LOG = Logger.getLogger(this.getClass) + + private def processEachPartition(iterator: Iterator[Vertex]): Unit = { + val graphProvider = + new GraphProvider(config.databaseConfig.getGraphAddress, + config.connectionConfig.timeout, + config.sslConfig) + + val writer = new NebulaGraphClientWriter(config.databaseConfig, + config.userConfig, + config.rateConfig, + tagConfig, + graphProvider) + + val errorBuffer = ArrayBuffer[String]() + + writer.prepare() + // batch write tags + val startTime = System.currentTimeMillis + iterator.grouped(tagConfig.batch).foreach { vertex => + val vertices = Vertices(nebulaKeys, vertex.toList, tagConfig.vertexPolicy) + val failStatement = writer.writeVertices(vertices) + if (failStatement == null) { + batchSuccess.add(1) + } else { + errorBuffer.append(failStatement) + batchFailure.add(1) + } + } + if (errorBuffer.nonEmpty) { + ErrorHandler.save( + errorBuffer, + s"${config.errorConfig.errorPath}/${tagConfig.name}.${TaskContext.getPartitionId()}") + errorBuffer.clear() + } + LOG.info(s"tag ${tagConfig.name} import in spark partition ${TaskContext + .getPartitionId()} cost ${System.currentTimeMillis() - startTime} ms") + writer.close() + graphProvider.close() + } + + override def process(): Unit = { + + val address = config.databaseConfig.getMetaAddress + val space = config.databaseConfig.space + + val timeout = config.connectionConfig.timeout + val retry = config.connectionConfig.retry + val metaProvider = new MetaProvider(address, timeout, retry, config.sslConfig) + val fieldTypeMap = NebulaUtils.getDataSourceFieldType(tagConfig, space, metaProvider) + val isVidStringType = metaProvider.getVidType(space) == VidType.STRING + val partitionNum = metaProvider.getPartNumber(space) + + if (tagConfig.dataSinkConfigEntry.category == SinkCategory.SST) { + val fileBaseConfig = tagConfig.dataSinkConfigEntry.asInstanceOf[FileBaseSinkConfigEntry] + val namenode = fileBaseConfig.fsName.orNull + val tagName = 
tagConfig.name + val vidType = metaProvider.getVidType(space) + + val spaceVidLen = metaProvider.getSpaceVidLen(space) + val tagItem = metaProvider.getTagItem(space, tagName) + + data + .dropDuplicates(tagConfig.vertexField) + .mapPartitions { iter => + iter.map { row => + encodeVertex(row, partitionNum, vidType, spaceVidLen, tagItem, fieldTypeMap) + } + }(Encoders.tuple(Encoders.BINARY, Encoders.BINARY)) + .toDF("key", "value") + .sortWithinPartitions("key") + .foreachPartition { iterator: Iterator[Row] => + val sstFileWriter = new NebulaSSTWriter + sstFileWriter.writeSstFiles(iterator, + fileBaseConfig, + partitionNum, + namenode, + batchFailure) + } + } else { + val streamFlag = data.isStreaming + val vertices = data + .filter { row => + isVertexValid(row, tagConfig, streamFlag, isVidStringType) + } + .map { row => + convertToVertex(row, tagConfig, isVidStringType, fieldKeys, fieldTypeMap) + }(Encoders.kryo[Vertex]) + + // streaming write + if (streamFlag) { + val streamingDataSourceConfig = + tagConfig.dataSourceConfigEntry.asInstanceOf[StreamingDataSourceConfigEntry] + val wStream = vertices.writeStream + if (tagConfig.checkPointPath.isDefined) + wStream.option("checkpointLocation", tagConfig.checkPointPath.get) + + wStream + .foreachBatch((vertexSet: Dataset[Vertex], batchId: Long) => { + LOG.info(s"${tagConfig.name} tag start batch ${batchId}.") + vertexSet.foreachPartition(processEachPartition _) + }) + .trigger(Trigger.ProcessingTime(s"${streamingDataSourceConfig.intervalSeconds} seconds")) + .start() + .awaitTermination() + } else + vertices.foreachPartition(processEachPartition _) + } + } + + /** + * filter and check row data for vertex, if streaming only print log + * for not streaming datasource, if the vertex data is invalid, throw AssertException. 
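+ * Checks performed: the vertex id field must exist and be non-null; an int vid space requires a
+ * numeric id unless a policy is configured; a policy is only allowed when the vid type is int.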
+ */ + def isVertexValid(row: Row, + tagConfig: TagConfigEntry, + streamFlag: Boolean, + isVidStringType: Boolean): Boolean = { + val index = row.schema.fieldIndex(tagConfig.vertexField) + if (index < 0 || row.isNullAt(index)) { + printChoice(streamFlag, s"vertexId must exist and cannot be null, your row data is $row") + return false + } + + val vertexId = row.get(index).toString + // process int type vid + if (tagConfig.vertexPolicy.isEmpty && !isVidStringType && !NebulaUtils.isNumic(vertexId)) { + printChoice( + streamFlag, + s"space vidType is int, but your vertex id $vertexId is not numeric.your row data is $row") + return false + } + // process string type vid + if (tagConfig.vertexPolicy.isDefined && isVidStringType) { + printChoice( + streamFlag, + s"only int vidType can use policy, but your vidType is FIXED_STRING.your row data is $row") + return false + } + true + } + + /** + * Convert row data to {@link Vertex} + */ + def convertToVertex(row: Row, + tagConfig: TagConfigEntry, + isVidStringType: Boolean, + fieldKeys: List[String], + fieldTypeMap: Map[String, Int]): Vertex = { + val index = row.schema.fieldIndex(tagConfig.vertexField) + var vertexId = row.get(index).toString + if (vertexId.equals(DEFAULT_EMPTY_VALUE)) { + vertexId = "" + } + + if (tagConfig.vertexPolicy.isEmpty && isVidStringType) { + vertexId = NebulaUtils.escapeUtil(vertexId).mkString("\"", "", "\"") + } + + val values = for { + property <- fieldKeys if property.trim.length != 0 + } yield extraValueForClient(row, property, fieldTypeMap) + Vertex(vertexId, values) + } + + /** + * encode vertex + */ + def encodeVertex(row: Row, + partitionNum: Int, + vidType: VidType.Value, + spaceVidLen: Int, + tagItem: TagItem, + fieldTypeMap: Map[String, Int]): (Array[Byte], Array[Byte]) = { + // check if vertex id is valid, if not, throw AssertException + isVertexValid(row, tagConfig, false, vidType == VidType.STRING) + + val index: Int = row.schema.fieldIndex(tagConfig.vertexField) + var vertexId: String = row.get(index).toString + if (vertexId.equals(DEFAULT_EMPTY_VALUE)) { + vertexId = "" + } + if (tagConfig.vertexPolicy.isDefined) { + tagConfig.vertexPolicy.get match { + case KeyPolicy.HASH => + vertexId = MurmurHash2 + .hash64(vertexId.getBytes(), vertexId.getBytes().length, 0xc70f6907) + .toString + case KeyPolicy.UUID => + throw new UnsupportedOperationException("do not support uuid yet") + case _ => + throw new IllegalArgumentException(s"policy ${tagConfig.vertexPolicy.get} is invalidate") + } + } + + val partitionId = NebulaUtils.getPartitionId(vertexId, partitionNum, vidType) + + import java.nio.ByteBuffer + val vidBytes = if (vidType == VidType.INT) { + ByteBuffer + .allocate(8) + .order(ByteOrder.nativeOrder) + .putLong(vertexId.toLong) + .array + } else { + vertexId.getBytes() + } + val codec = new NebulaCodecImpl() + val vertexKey = codec.vertexKey(spaceVidLen, partitionId, vidBytes, tagItem.getTag_id) + val values = for { + property <- fieldKeys if property.trim.length != 0 + } yield + extraValueForSST(row, property, fieldTypeMap) + .asInstanceOf[AnyRef] + val vertexValue = codec.encodeTag(tagItem, nebulaKeys.asJava, values.asJava) + (vertexKey, vertexValue) + } +} diff --git a/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala new file mode 100644 index 00000000..2cd7e476 --- /dev/null +++ 
b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/FileBaseReader.scala @@ -0,0 +1,115 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.nebula.exchange.reader + +import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, Row, SparkSession} + +/** + * The FileBaseReader is the abstract class for HDFS file reader. + * + * @param session + * @param path + */ +abstract class FileBaseReader(val session: SparkSession, val path: String) extends Reader { + + require(path.trim.nonEmpty) + + override def close(): Unit = { + session.close() + } +} + +/** + * The ParquetReader extend the FileBaseReader and support read parquet file from HDFS. + * + * @param session + * @param parquetConfig + */ +class ParquetReader(override val session: SparkSession, parquetConfig: FileBaseSourceConfigEntry) + extends FileBaseReader(session, parquetConfig.path) { + + override def read(): DataFrame = { + session.read.parquet(path) + } +} + +/** + * The ORCReader extend the FileBaseReader and support read orc file from HDFS. + * + * @param session + * @param orcConfig + */ +class ORCReader(override val session: SparkSession, orcConfig: FileBaseSourceConfigEntry) + extends FileBaseReader(session, orcConfig.path) { + + override def read(): DataFrame = { + session.read.orc(path) + } +} + +/** + * The JSONReader extend the FileBaseReader and support read json file from HDFS. + * + * @param session + * @param jsonConfig + */ +class JSONReader(override val session: SparkSession, jsonConfig: FileBaseSourceConfigEntry) + extends FileBaseReader(session, jsonConfig.path) { + + override def read(): DataFrame = { + session.read.json(path) + } +} + +/** + * The CSVReader extend the FileBaseReader and support read csv file from HDFS. + * All types of the structure are StringType. + * + * @param session + * @param csvConfig + */ +class CSVReader(override val session: SparkSession, csvConfig: FileBaseSourceConfigEntry) + extends FileBaseReader(session, csvConfig.path) { + + override def read(): DataFrame = { + session.read + .option("delimiter", csvConfig.separator.get) + .option("header", csvConfig.header.get) + .option("emptyValue", DEFAULT_EMPTY_VALUE) + .csv(path) + } +} + +/** + * The CustomReader extend the FileBaseReader and support read text file from HDFS. + * Transformation is a function convert a line into Row. + * The structure of the row should be specified. 
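+ * Rows produced by the transformation are additionally filtered with the given predicate before
+ * the DataFrame is returned. A hypothetical concrete subclass for a two-column comma separated
+ * text file could look like this (class and field names are illustrative only):
+ * {{{
+ * class TwoColumnReader(session: SparkSession, conf: FileBaseSourceConfigEntry)
+ *   extends CustomReader(
+ *     session,
+ *     conf,
+ *     line => { val parts = line.split(","); Row(parts(0), parts(1)) },
+ *     row => !row.getString(0).isEmpty,
+ *     StructType(Seq(StructField("src", StringType), StructField("dst", StringType))))
+ * }}}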
+ * + * @param session + * @param customConfig + * @param transformation + * @param structType + */ +abstract class CustomReader(override val session: SparkSession, + customConfig: FileBaseSourceConfigEntry, + transformation: String => Row, + filter: Row => Boolean, + structType: StructType) + extends FileBaseReader(session, customConfig.path) { + + override def read(): DataFrame = { + val encoder = RowEncoder.apply(structType) + session.read + .text(path) + .filter(!_.getString(0).isEmpty) + .map(row => transformation(row.getString(0)))(encoder) + .filter(filter) + } +} diff --git a/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala new file mode 100644 index 00000000..fb8455e6 --- /dev/null +++ b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/Reader.scala @@ -0,0 +1,63 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.nebula.exchange.reader + +import com.vesoft.exchange.common.Offset +import com.vesoft.exchange.common.utils.HDFSUtils +import org.apache.spark.sql.{DataFrame, SparkSession} + +/** + * The Reader is used to create a DataFrame from the source, such as Hive or HDFS. + */ +trait Reader extends Serializable { + def session: SparkSession + + def read(): DataFrame + + def close(): Unit +} + +trait CheckPointSupport extends Serializable { + + def getOffsets(totalCount: Long, + parallel: Int, + checkPointPath: Option[String], + checkPointNamePrefix: String): List[Offset] = { + if (totalCount <= 0) + throw new RuntimeException(s"${checkPointNamePrefix}: return data count<=0") + + val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) ::: List + .fill((parallel - totalCount % parallel).toInt)(totalCount / parallel) + + val startOffsets = batchSizes.scanLeft(0L)(_ + _).init + + val checkPointOffsets = checkPointPath match { + case Some(path) => + val files = Range(0, parallel).map(i => s"${path}/${checkPointNamePrefix}.${i}").toList + if (files.forall(HDFSUtils.exists)) + files.map(HDFSUtils.getContent(_).trim.toLong).sorted + else startOffsets + case _ => startOffsets + } + + if (checkPointOffsets.zip(startOffsets).exists(x => x._1 < x._2)) + throw new RuntimeException( + s"Check Point file maybe previous. Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") + + val eachPartitionLimit = { + batchSizes + .zip(startOffsets.zip(checkPointOffsets)) + .map(x => { + x._1 - (x._2._2 - x._2._1) + }) + } + val offsets = checkPointOffsets.zip(eachPartitionLimit).map(x => Offset(x._1, x._2)) + if (offsets.exists(_.size < 0L)) + throw new RuntimeException( + s"Check point file maybe broken. Please delete ${checkPointPath}/${checkPointNamePrefix}.* file") + offsets + } +} diff --git a/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala new file mode 100644 index 00000000..4f893f12 --- /dev/null +++ b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/ServerBaseReader.scala @@ -0,0 +1,204 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
+ */
+
+package com.vesoft.nebula.exchange.reader
+
+import com.vesoft.exchange.common.config.{
+  ClickHouseConfigEntry,
+  HBaseSourceConfigEntry,
+  HiveSourceConfigEntry,
+  JanusGraphSourceConfigEntry,
+  MaxComputeConfigEntry,
+  MySQLSourceConfigEntry,
+  Neo4JSourceConfigEntry,
+  ServerDataSourceConfigEntry
+}
+import org.apache.hadoop.hbase.HBaseConfiguration
+import org.apache.hadoop.hbase.client.Result
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.mapreduce.TableInputFormat
+import org.apache.hadoop.hbase.util.Bytes
+import org.apache.log4j.Logger
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.types.{DataTypes, StructType}
+import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+
+import scala.collection.mutable.ListBuffer
+
+/**
+  * ServerBaseReader is the abstract class for server-based readers.
+  * It includes a spark session and a sentence which will be sent to the service.
+  * @param session
+  * @param sentence
+  */
+abstract class ServerBaseReader(override val session: SparkSession, val sentence: String)
+    extends Reader {
+
+  override def close(): Unit = {
+    session.close()
+  }
+}
+
+/**
+  * HiveReader extends the {@link ServerBaseReader}.
+  * The HiveReader reads data from Apache Hive via the sentence.
+  * @param session
+  * @param hiveConfig
+  */
+class HiveReader(override val session: SparkSession, hiveConfig: HiveSourceConfigEntry)
+    extends ServerBaseReader(session, hiveConfig.sentence) {
+  override def read(): DataFrame = {
+    session.sql(sentence)
+  }
+}
+
+/**
+  * The MySQLReader extends the ServerBaseReader.
+  * The MySQLReader reads data from MySQL via the sentence.
+  *
+  * @param session
+  * @param mysqlConfig
+  */
+class MySQLReader(override val session: SparkSession, mysqlConfig: MySQLSourceConfigEntry)
+    extends ServerBaseReader(session, mysqlConfig.sentence) {
+  override def read(): DataFrame = {
+    val url =
+      s"jdbc:mysql://${mysqlConfig.host}:${mysqlConfig.port}/${mysqlConfig.database}?useUnicode=true&characterEncoding=utf-8"
+    val df = session.read
+      .format("jdbc")
+      .option("url", url)
+      .option("dbtable", mysqlConfig.table)
+      .option("user", mysqlConfig.user)
+      .option("password", mysqlConfig.password)
+      .load()
+    df.createOrReplaceTempView(mysqlConfig.table)
+    session.sql(sentence)
+  }
+}
+
+/**
+  * Neo4JReader extends the ServerBaseReader.
+  * This reader supports checkpoints at the cost of performance.
+  * @param session
+  * @param neo4jConfig
+  */
+class Neo4JReader(override val session: SparkSession, neo4jConfig: Neo4JSourceConfigEntry)
+    extends ServerBaseReader(session, neo4jConfig.sentence)
+    with CheckPointSupport {
+
+  @transient lazy private val LOG = Logger.getLogger(this.getClass)
+
+  override def read(): DataFrame = {
+    throw new UnsupportedOperationException("neo4j datasource is not supported yet for spark 3")
+  }
+}
+
+/**
+  * JanusGraphReader extends the {@link ServerBaseReader}.
+  * @param session
+  * @param janusGraphConfig
+  */
+class JanusGraphReader(override val session: SparkSession,
+                       janusGraphConfig: JanusGraphSourceConfigEntry)
+    extends ServerBaseReader(session, "")
+    with CheckPointSupport {
+
+  override def read(): DataFrame = {
+    throw new UnsupportedOperationException(
+      "janusgraph datasource is not supported yet for spark 3")
+  }
+}
+
+/**
+  * NebulaReader extends the {@link ServerBaseReader}.
+  * @param session
+  * @param nebulaConfig
+  */
+class NebulaReader(override val session: SparkSession, nebulaConfig: ServerDataSourceConfigEntry)
+    extends ServerBaseReader(session, nebulaConfig.sentence) {
+  override def read(): DataFrame = ???
+} + +/** + * HBaseReader extends [[ServerBaseReader]] + * + */ +class HBaseReader(override val session: SparkSession, hbaseConfig: HBaseSourceConfigEntry) + extends ServerBaseReader(session, null) { + + private[this] val LOG = Logger.getLogger(this.getClass) + + override def read(): DataFrame = { + val cf = hbaseConfig.columnFamily + val scanConf = HBaseConfiguration.create() + scanConf.set("hbase.zookeeper.quorum", hbaseConfig.host) + scanConf.set("hbase.zookeeper.property.clientPort", hbaseConfig.port) + scanConf.set(TableInputFormat.INPUT_TABLE, hbaseConfig.table) + hbaseConfig.fields.filter(field => !field.equalsIgnoreCase("rowkey")) + scanConf.set(TableInputFormat.SCAN_COLUMNS, + hbaseConfig.fields + .filter(field => !field.equalsIgnoreCase("rowkey")) + .map(field => s"$cf:$field") + .mkString(" ")) + val fields = hbaseConfig.fields + + val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] = session.sparkContext.newAPIHadoopRDD( + scanConf, + classOf[TableInputFormat], + classOf[ImmutableBytesWritable], + classOf[Result]) + + val rowRDD = hbaseRDD.map(row => { + val values: ListBuffer[String] = new ListBuffer[String] + val result: Result = row._2 + + for (i <- fields.indices) { + if (fields(i).equalsIgnoreCase("rowkey")) { + values += Bytes.toString(result.getRow) + } else { + values += Bytes.toString(result.getValue(Bytes.toBytes(cf), Bytes.toBytes(fields(i)))) + } + } + Row.fromSeq(values.toList) + }) + val schema = StructType( + fields.map(field => DataTypes.createStructField(field, DataTypes.StringType, true))) + val dataFrame = session.createDataFrame(rowRDD, schema) + dataFrame + } +} + +/** + * MaxCompute Reader + */ +class MaxcomputeReader(override val session: SparkSession, maxComputeConfig: MaxComputeConfigEntry) + extends ServerBaseReader(session, maxComputeConfig.sentence) { + + override def read(): DataFrame = { + throw new UnsupportedOperationException( + "maxcompute datasource is not supported yet for spark 3") + } +} + +/** + * Clickhouse reader + */ +class ClickhouseReader(override val session: SparkSession, + clickHouseConfigEntry: ClickHouseConfigEntry) + extends ServerBaseReader(session, clickHouseConfigEntry.sentence) { + Class.forName("ru.yandex.clickhouse.ClickHouseDriver") + override def read(): DataFrame = { + val df = session.read + .format("jdbc") + .option("driver", "ru.yandex.clickhouse.ClickHouseDriver") + .option("url", clickHouseConfigEntry.url) + .option("user", clickHouseConfigEntry.user) + .option("password", clickHouseConfigEntry.passwd) + .option("numPartitions", clickHouseConfigEntry.numPartition) + .option("query", clickHouseConfigEntry.sentence) + .load() + df + } +} diff --git a/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala new file mode 100644 index 00000000..a3640698 --- /dev/null +++ b/nebula-exchange_spark_3.0/src/main/scala/com/vesoft/nebula/exchange/reader/StreamingBaseReader.scala @@ -0,0 +1,78 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
+ */ + +package com.vesoft.nebula.exchange.reader + +import com.vesoft.exchange.common.config.{KafkaSourceConfigEntry, PulsarSourceConfigEntry} +import org.apache.spark.sql.types.StringType +import org.apache.spark.sql.{DataFrame, SparkSession} + +/** + * Spark Streaming + * + * @param session + */ +abstract class StreamingBaseReader(override val session: SparkSession) extends Reader { + + override def close(): Unit = { + session.close() + } +} + +/** + * + * @param session + * @param kafkaConfig + * @param targetFields + */ +class KafkaReader(override val session: SparkSession, + kafkaConfig: KafkaSourceConfigEntry, + targetFields: List[String]) + extends StreamingBaseReader(session) { + + require( + kafkaConfig.server.trim.nonEmpty && kafkaConfig.topic.trim.nonEmpty && targetFields.nonEmpty) + + override def read(): DataFrame = { + import org.apache.spark.sql.functions._ + import session.implicits._ + val fields = targetFields.distinct + val reader = + session.readStream + .format("kafka") + .option("kafka.bootstrap.servers", kafkaConfig.server) + .option("subscribe", kafkaConfig.topic) + .option("startingOffsets", kafkaConfig.startingOffsets) + + val maxOffsetsPerTrigger = kafkaConfig.maxOffsetsPerTrigger + if (maxOffsetsPerTrigger.isDefined) + reader.option("maxOffsetsPerTrigger", maxOffsetsPerTrigger.get) + + reader + .load() + .select($"value".cast(StringType)) + .select(json_tuple($"value", fields: _*)) + .toDF(fields: _*) + + } +} + +/** + * + * @param session + * @param pulsarConfig + */ +class PulsarReader(override val session: SparkSession, pulsarConfig: PulsarSourceConfigEntry) + extends StreamingBaseReader(session) { + + override def read(): DataFrame = { + session.readStream + .format("pulsar") + .option("service.url", pulsarConfig.serviceUrl) + .option("admin.url", pulsarConfig.adminUrl) + .options(pulsarConfig.options) + .load() + } +} diff --git a/nebula-exchange_spark_3.0/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala b/nebula-exchange_spark_3.0/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala new file mode 100644 index 00000000..f4431873 --- /dev/null +++ b/nebula-exchange_spark_3.0/src/test/scala/com/vesoft/nebula/exchange/processor/EdgeProcessorSuite.scala @@ -0,0 +1,245 @@ +/* Copyright (c) 2021 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
+ */ + +package com.vesoft.nebula.exchange.processor + +import java.io.File + +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.PropertyType +import com.vesoft.exchange.common.KeyPolicy +import com.vesoft.exchange.common.config.{Configs, EdgeConfigEntry} +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.nebula.meta.{ColumnDef, ColumnTypeDef, EdgeItem, Schema, SchemaProp} +import org.apache.commons.codec.binary.Hex +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.types.{ + BooleanType, + DoubleType, + IntegerType, + LongType, + ShortType, + StringType, + StructField, + StructType +} +import org.apache.spark.sql.{DataFrame, Row} +import org.junit.Test +import org.scalatest.Assertions.assertThrows + +import scala.collection.JavaConverters._ + +class EdgeProcessorSuite { + val config: Configs = + Configs.parse(new File("../exchange-common/src/test/resources/process_application.conf")) + + var data: DataFrame = null + var edgeConfig: EdgeConfigEntry = config.edgesConfig.head + val fieldKeys = List("col1", + "col2", + "col3", + "col4", + "col5", + "col6", + "col7", + "col8", + "col9", + "col10", + "col11", + "col12", + "col13", + "col14") + val nebulaKeys = List("col1", + "col2", + "col3", + "col4", + "col5", + "col6", + "col7", + "col8", + "col9", + "col10", + "col11", + "col12", + "col13", + "col14") + + val processClazz = + new EdgeProcessor(data, edgeConfig, fieldKeys, nebulaKeys, config, null, null) + @Test + def isEdgeValidSuite(): Unit = { + val stringIdValue = List("Bob", "Tom") + val intIdValue = List("11", "12") + val schema: StructType = StructType( + List(StructField("src", StringType, nullable = true), + StructField("dst", StringType, nullable = true))) + val stringIdRow = new GenericRowWithSchema(stringIdValue.toArray, schema) + val intIdRow = new GenericRowWithSchema(intIdValue.toArray, schema) + val edgeConfigEntry = EdgeConfigEntry("friend", + null, + null, + fieldKeys, + nebulaKeys, + "src", + None, + None, + "dst", + None, + false, + None, + None, + 10, + 10, + None) + + // test for string id value without policy + assert(processClazz.isEdgeValid(stringIdRow, edgeConfigEntry, false, true)) + assert(processClazz.isEdgeValid(stringIdRow, edgeConfigEntry, true, true)) + assert(!processClazz.isEdgeValid(stringIdRow, edgeConfigEntry, true, false)) + assertThrows[AssertionError]( + processClazz.isEdgeValid(stringIdRow, edgeConfigEntry, false, false)) + + // test for int id value without policy + assert(processClazz.isEdgeValid(intIdRow, edgeConfigEntry, false, false)) + assert(processClazz.isEdgeValid(intIdRow, edgeConfigEntry, true, false)) + assert(processClazz.isEdgeValid(intIdRow, edgeConfigEntry, true, true)) + assert(processClazz.isEdgeValid(intIdRow, edgeConfigEntry, false, true)) + + // test for string id value with policy + val edgeConfigEntryWithPolicy = EdgeConfigEntry("friend", + null, + null, + fieldKeys, + nebulaKeys, + "src", + Some(KeyPolicy.HASH), + None, + "dst", + Some(KeyPolicy.HASH), + false, + None, + None, + 10, + 10, + None) + assert(!processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, true, true)) + assertThrows[AssertionError]( + processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, false, true)) + + // test for int id value with policy + assert(processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, true, false)) + assert(!processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, true, true)) + 
assert(processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, false, false)) + assertThrows[AssertionError]( + processClazz.isEdgeValid(stringIdRow, edgeConfigEntryWithPolicy, false, true)) + } + + @Test + def convertToEdgeSuite(): Unit = { + val row = getRow() + val map = getFieldType() + val edge = processClazz.convertToEdge(row, edgeConfig, true, fieldKeys, map) + assert(edge.source.equals("\"1\"")) + assert(edge.destination.equals("\"2\"")) + assert(edge.toString.equals( + "Edge: \"1\"->\"2\"@0 values: \"\", \"fixedBob\", 12, 200, 1000, 100000, date(\"2021-01-01\"), datetime(\"2021-01-01T12:00:00.100\"), time(\"12:00:00.100\"), 345436232, true, 12.01, 22.12, ST_GeogFromText(\"POINT(3 8)\")")) + } + + @Test + def encodeEdgeSuite(): Unit = { + val row = getRow() + val columns = List( + new ColumnDef("col1".getBytes(), new ColumnTypeDef(PropertyType.STRING)), + new ColumnDef("col2".getBytes(), new ColumnTypeDef(PropertyType.STRING)), + new ColumnDef("col3".getBytes(), new ColumnTypeDef(PropertyType.INT8)), + new ColumnDef("col4".getBytes(), new ColumnTypeDef(PropertyType.INT16)), + new ColumnDef("col5".getBytes(), new ColumnTypeDef(PropertyType.INT32)), + new ColumnDef("col6".getBytes(), new ColumnTypeDef(PropertyType.INT64)), + new ColumnDef("col7".getBytes(), new ColumnTypeDef(PropertyType.DATE)), + new ColumnDef("col8".getBytes(), new ColumnTypeDef(PropertyType.DATETIME)), + new ColumnDef("col9".getBytes(), new ColumnTypeDef(PropertyType.TIME)), + new ColumnDef("col10".getBytes(), new ColumnTypeDef(PropertyType.TIMESTAMP)), + new ColumnDef("col11".getBytes(), new ColumnTypeDef(PropertyType.BOOL)), + new ColumnDef("col12".getBytes(), new ColumnTypeDef(PropertyType.DOUBLE)), + new ColumnDef("col13".getBytes(), new ColumnTypeDef(PropertyType.FLOAT)), + new ColumnDef("col14".getBytes(), new ColumnTypeDef(PropertyType.GEOGRAPHY)) + ) + val schema = new Schema(columns.asJava, new SchemaProp()) + val edgeItem = new EdgeItem(2, "friend".getBytes(), -1, schema) + val map = getFieldType() + + val (key1, key2, value) = processClazz.encodeEdge(row, 10, VidType.STRING, 10, edgeItem, map) + + val keyHex1 = Hex.encodeHexString(key1) + val keyHex2 = Hex.encodeHexString(key2) + val valueHex = Hex.encodeHexString(value) + assert( + keyHex1.equals("02060000310000000000000000000200000080000000000000003200000000000000000001")) + assert( + keyHex2.equals("0201000032000000000000000000feffffff80000000000000003100000000000000000001")) + } + + private def getRow(): Row = { + val values = List( + "1", + "2", + DEFAULT_EMPTY_VALUE, + "fixedBob", + 12, + 200, + 1000, + 100000, + "2021-01-01", + "2021-01-01T12:00:00.100", + "12:00:00.100", + "345436232", + true, + 12.01, + 22.12, + "POINT(3 8)" + ) + val schema: StructType = StructType( + List( + StructField("src", StringType, nullable = false), + StructField("dst", StringType, nullable = false), + StructField("col1", StringType, nullable = true), + StructField("col2", StringType, nullable = true), + StructField("col3", ShortType, nullable = true), + StructField("col4", ShortType, nullable = true), + StructField("col5", IntegerType, nullable = true), + StructField("col6", LongType, nullable = true), + StructField("col7", StringType, nullable = true), + StructField("col8", StringType, nullable = true), + StructField("col9", StringType, nullable = true), + StructField("col10", StringType, nullable = true), + StructField("col11", BooleanType, nullable = true), + StructField("col12", DoubleType, nullable = true), + StructField("col13", DoubleType, nullable = 
true), + StructField("col14", StringType, nullable = true) + )) + val row = new GenericRowWithSchema(values.toArray, schema) + row + } + + private def getFieldType(): Map[String, Int] = { + val map = Map( + "col1" -> PropertyType.STRING.getValue, + "col2" -> PropertyType.STRING.getValue, + "col3" -> PropertyType.INT8.getValue, + "col4" -> PropertyType.INT16.getValue, + "col5" -> PropertyType.INT32.getValue, + "col6" -> PropertyType.INT64.getValue, + "col7" -> PropertyType.DATE.getValue, + "col8" -> PropertyType.DATETIME.getValue, + "col9" -> PropertyType.TIME.getValue, + "col10" -> PropertyType.TIMESTAMP.getValue, + "col11" -> PropertyType.BOOL.getValue, + "col12" -> PropertyType.DOUBLE.getValue, + "col13" -> PropertyType.FLOAT.getValue, + "col14" -> PropertyType.GEOGRAPHY.getValue + ) + map + } +} diff --git a/nebula-exchange_spark_3.0/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala b/nebula-exchange_spark_3.0/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala new file mode 100644 index 00000000..2340df94 --- /dev/null +++ b/nebula-exchange_spark_3.0/src/test/scala/com/vesoft/nebula/exchange/processor/VerticesProcessorSuite.scala @@ -0,0 +1,207 @@ +/* Copyright (c) 2021 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +package com.vesoft.nebula.exchange.processor + +import java.io.File + +import com.vesoft.exchange.common.VidType +import com.vesoft.nebula.PropertyType +import com.vesoft.exchange.common.KeyPolicy +import com.vesoft.exchange.common.config.{Configs, TagConfigEntry} +import com.vesoft.exchange.common.utils.NebulaUtils.DEFAULT_EMPTY_VALUE +import com.vesoft.nebula.meta.{ColumnDef, ColumnTypeDef, Schema, SchemaProp, TagItem} +import org.apache.commons.codec.binary.Hex +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.types.{ + BooleanType, + DoubleType, + IntegerType, + LongType, + ShortType, + StringType, + StructField, + StructType +} +import org.apache.spark.sql.{DataFrame, Row} +import org.junit.Test +import org.scalatest.Assertions.assertThrows + +import scala.collection.JavaConverters._ + +class VerticesProcessorSuite { + val config: Configs = + Configs.parse(new File("../exchange-common/src/test/resources/process_application.conf")) + + var data: DataFrame = null + var tagConfig: TagConfigEntry = config.tagsConfig.head + val fieldKeys = List("col1", + "col2", + "col3", + "col4", + "col5", + "col6", + "col7", + "col8", + "col9", + "col10", + "col11", + "col12", + "col13", + "col14") + val nebulaKeys = List("col1", + "col2", + "col3", + "col4", + "col5", + "col6", + "col7", + "col8", + "col9", + "col10", + "col11", + "col12", + "col13", + "col14") + + val processClazz = + new VerticesProcessor(data, tagConfig, fieldKeys, nebulaKeys, config, null, null) + @Test + def isVertexValidSuite(): Unit = { + val stringIdValue = List("Bob") + val intIdValue = List("11") + val schema: StructType = StructType(List(StructField("id", StringType, nullable = true))) + val stringIdRow = new GenericRowWithSchema(stringIdValue.toArray, schema) + val intIdRow = new GenericRowWithSchema(intIdValue.toArray, schema) + val tagConfigEntry = TagConfigEntry("person", null, null, null, null, "id", None, 10, 10, None) + + // test for string id value without policy + assert(processClazz.isVertexValid(stringIdRow, tagConfigEntry, false, true)) + assert(processClazz.isVertexValid(stringIdRow, tagConfigEntry, true, true)) + 
assert(!processClazz.isVertexValid(stringIdRow, tagConfigEntry, true, false)) + assertThrows[AssertionError]( + processClazz.isVertexValid(stringIdRow, tagConfigEntry, false, false)) + + // test for int id value without policy + assert(processClazz.isVertexValid(intIdRow, tagConfigEntry, false, false)) + assert(processClazz.isVertexValid(intIdRow, tagConfigEntry, true, false)) + assert(processClazz.isVertexValid(intIdRow, tagConfigEntry, true, true)) + assert(processClazz.isVertexValid(intIdRow, tagConfigEntry, false, true)) + + // test for string id value with policy + val tagConfigEntryWithPolicy = + TagConfigEntry("person", null, null, null, null, "id", Some(KeyPolicy.HASH), 10, 10, None) + assert(!processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, true, true)) + assertThrows[AssertionError]( + processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, false, true)) + + // test for int id value with policy + assert(processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, true, false)) + assert(!processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, true, true)) + assert(processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, false, false)) + assertThrows[AssertionError]( + processClazz.isVertexValid(stringIdRow, tagConfigEntryWithPolicy, false, true)) + } + + @Test + def convertToVertexSuite(): Unit = { + val row = getRow() + val map = getFieldType() + val vertex = processClazz.convertToVertex(row, tagConfig, true, fieldKeys, map) + assert(vertex.vertexID.equals("\"1\"")) + assert(vertex.toString.equals( + "Vertex ID: \"1\", Values: \"\", \"fixedBob\", 12, 200, 1000, 100000, date(\"2021-01-01\"), datetime(\"2021-01-01T12:00:00.100\"), time(\"12:00:00.100\"), 345436232, true, 12.01, 22.12, ST_GeogFromText(\"POINT(3 8)\")")) + } + + @Test + def encodeVertexSuite(): Unit = { + val row = getRow() + val columns = List( + new ColumnDef("col1".getBytes(), new ColumnTypeDef(PropertyType.STRING)), + new ColumnDef("col2".getBytes(), new ColumnTypeDef(PropertyType.STRING)), + new ColumnDef("col3".getBytes(), new ColumnTypeDef(PropertyType.INT8)), + new ColumnDef("col4".getBytes(), new ColumnTypeDef(PropertyType.INT16)), + new ColumnDef("col5".getBytes(), new ColumnTypeDef(PropertyType.INT32)), + new ColumnDef("col6".getBytes(), new ColumnTypeDef(PropertyType.INT64)), + new ColumnDef("col7".getBytes(), new ColumnTypeDef(PropertyType.DATE)), + new ColumnDef("col8".getBytes(), new ColumnTypeDef(PropertyType.DATETIME)), + new ColumnDef("col9".getBytes(), new ColumnTypeDef(PropertyType.TIME)), + new ColumnDef("col10".getBytes(), new ColumnTypeDef(PropertyType.TIMESTAMP)), + new ColumnDef("col11".getBytes(), new ColumnTypeDef(PropertyType.BOOL)), + new ColumnDef("col12".getBytes(), new ColumnTypeDef(PropertyType.DOUBLE)), + new ColumnDef("col13".getBytes(), new ColumnTypeDef(PropertyType.FLOAT)), + new ColumnDef("col14".getBytes(), new ColumnTypeDef(PropertyType.GEOGRAPHY)) + ) + val schema = new Schema(columns.asJava, new SchemaProp()) + val tagItem = new TagItem(1, "person".getBytes(), -1, schema) + val map = getFieldType() + + val (key, value) = processClazz.encodeVertex(row, 10, VidType.STRING, 10, tagItem, map) + + val keyHex = Hex.encodeHexString(key) + val valueHex = Hex.encodeHexString(value) + assert(keyHex.equals("010600003100000000000000000001000000")) + } + + private def getRow(): Row = { + val values = List( + "1", + DEFAULT_EMPTY_VALUE, + "fixedBob", + 12, + 200, + 1000, + 100000, + "2021-01-01", + "2021-01-01T12:00:00.100", + 
"12:00:00.100", + "345436232", + true, + 12.01, + 22.12, + "POINT(3 8)" + ) + val schema: StructType = StructType( + List( + StructField("id", StringType, nullable = false), + StructField("col1", StringType, nullable = true), + StructField("col2", StringType, nullable = true), + StructField("col3", ShortType, nullable = true), + StructField("col4", ShortType, nullable = true), + StructField("col5", IntegerType, nullable = true), + StructField("col6", LongType, nullable = true), + StructField("col7", StringType, nullable = true), + StructField("col8", StringType, nullable = true), + StructField("col9", StringType, nullable = true), + StructField("col10", StringType, nullable = true), + StructField("col11", BooleanType, nullable = true), + StructField("col12", DoubleType, nullable = true), + StructField("col13", DoubleType, nullable = true), + StructField("col14", StringType, nullable = true) + )) + val row = new GenericRowWithSchema(values.toArray, schema) + row + } + + private def getFieldType(): Map[String, Int] = { + val map = Map( + "col1" -> PropertyType.STRING.getValue, + "col2" -> PropertyType.STRING.getValue, + "col3" -> PropertyType.INT8.getValue, + "col4" -> PropertyType.INT16.getValue, + "col5" -> PropertyType.INT32.getValue, + "col6" -> PropertyType.INT64.getValue, + "col7" -> PropertyType.DATE.getValue, + "col8" -> PropertyType.DATETIME.getValue, + "col9" -> PropertyType.TIME.getValue, + "col10" -> PropertyType.TIMESTAMP.getValue, + "col11" -> PropertyType.BOOL.getValue, + "col12" -> PropertyType.DOUBLE.getValue, + "col13" -> PropertyType.FLOAT.getValue, + "col14" -> PropertyType.GEOGRAPHY.getValue + ) + map + } +} diff --git a/pom.xml b/pom.xml index 1c9d080e..df55db1a 100644 --- a/pom.xml +++ b/pom.xml @@ -5,17 +5,13 @@ <modelVersion>4.0.0</modelVersion> <groupId>com.vesoft</groupId> - <artifactId>nebula-spark</artifactId> + <artifactId>exchange</artifactId> <packaging>pom</packaging> <version>2.5-SNAPSHOT</version> - <properties> - <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> - </properties> - <!-- More Project Information --> <name>nebula-exchange</name> - <description>Nebula Exchange </description> + <description>Nebula Exchange</description> <url>https://github.com/vesoft-inc/nebula-exchange</url> <scm> <connection>scm:git:https://github.com/vesoft-inc/nebula</connection> @@ -45,7 +41,10 @@ </developers> <modules> - <module>nebula-exchange</module> + <module>exchange-common</module> + <module>nebula-exchange_spark_2.4</module> + <module>nebula-exchange_spark_2.2</module> + <module>nebula-exchange_spark_3.0</module> </modules> <distributionManagement> @@ -61,6 +60,111 @@ </snapshotRepository> </distributionManagement> + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <scala.version>2.11.12</scala.version> + <scala.binary.version>2.11</scala.binary.version> + <scopt.version>3.7.1</scopt.version> + <scala-xml.version>2.11.0-M4</scala-xml.version> + <scalatest.version>3.2.0</scalatest.version> + <scopt.version>3.7.1</scopt.version> + <s2.version>1.0.0</s2.version> + <commons-codec.version>1.14</commons-codec.version> + <hadoop.version>2.6.1</hadoop.version> + <hbase.version>1.2.0</hbase.version> + <rocksdb.version>6.7.3</rocksdb.version> + <config.version>1.4.0</config.version> + </properties> + +<dependencies> + <!-- common dependencies --> + <dependency> + <groupId>org.rocksdb</groupId> + <artifactId>rocksdbjni</artifactId> + <version>${rocksdb.version}</version> + </dependency> + <dependency> + 
<groupId>com.typesafe</groupId> + <artifactId>config</artifactId> + <version>${config.version}</version> + </dependency> + <dependency> + <groupId>io.sgr</groupId> + <artifactId>s2-geometry-library-java</artifactId> + <version>${s2.version}</version> + </dependency> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + <version>${commons-codec.version}</version> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-client</artifactId> + <version>${hadoop.version}</version> + <exclusions> + <exclusion> + <artifactId>guava</artifactId> + <groupId>com.google.guava</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.hbase</groupId> + <artifactId>hbase-client</artifactId> + <version>${hbase.version}</version> + <exclusions> + <exclusion> + <artifactId>guava</artifactId> + <groupId>com.google.guava</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.hbase</groupId> + <artifactId>hbase-common</artifactId> + <version>${hbase.version}</version> + <exclusions> + <exclusion> + <artifactId>guava</artifactId> + <groupId>com.google.guava</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.hbase</groupId> + <artifactId>hbase-server</artifactId> + <version>${hbase.version}</version> + <exclusions> + <exclusion> + <artifactId>guava</artifactId> + <groupId>com.google.guava</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-xml</artifactId> + <version>${scala-xml.version}</version> + </dependency> + <dependency> + <groupId>com.github.scopt</groupId> + <artifactId>scopt_${scala.binary.version}</artifactId> + <version>${scopt.version}</version> + </dependency> + <dependency> + <groupId>ru.yandex.clickhouse</groupId> + <artifactId>clickhouse-jdbc</artifactId> + <version>0.2.5</version> + <exclusions> + <exclusion> + <artifactId>guava</artifactId> + <groupId>com.google.guava</groupId> + </exclusion> + </exclusions> + </dependency> +</dependencies> + <profiles> <!-- Deployment profile (required so these plugins are only used when deploying) --> <profile> @@ -121,6 +225,56 @@ </plugins> </build> </profile> + + <!-- scala and spark version --> + <profile> + <id>scala-2.11</id> + <properties> + <scala.version>2.11.12</scala.version> + <scala.binary.version>2.11</scala.binary.version> + </properties> + <activation> + <activeByDefault>true</activeByDefault> + </activation> + </profile> + <profile> + <id>scala-2.12</id> + <properties> + <scala.version>2.12.10</scala.version> + <scala.binary.version>2.12</scala.binary.version> + </properties> + </profile> + <profile> + <id>spark-2.2</id> + <properties> + <spark.version>2.2.0</spark.version> + </properties> + <modules> + <module>nebula-exchange_spark_2.2</module> + </modules> + </profile> + <profile> + <id>spark-2.4</id> + <properties> + <spark.version>2.4.4</spark.version> + </properties> + <activation> + <activeByDefault>true</activeByDefault> + </activation> + <modules> + <module>nebula-exchange_spark_2.4</module> + </modules> + </profile> + + <profile> + <id>spark-3.0</id> + <properties> + <spark.version>3.0.0</spark.version> + </properties> + <modules> + <module>nebula-exchange_spark_3.0</module> + </modules> + </profile> </profiles>
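
Editor's note: the offset-splitting arithmetic in CheckPointSupport.getOffsets (Reader.scala above) is easy to misread, so here is a minimal, self-contained sketch of the same calculation on concrete numbers. It only mirrors the partitioning math; the HDFS checkpoint lookup and the sanity checks are omitted, and the object name and sample values are illustrative only.

object OffsetSketch {
  def main(args: Array[String]): Unit = {
    val totalCount = 10L
    val parallel   = 3

    // The first (totalCount % parallel) partitions each take one extra row,
    // so every row is covered: 4 + 3 + 3 = 10.
    val batchSizes = List.fill((totalCount % parallel).toInt)(totalCount / parallel + 1) :::
      List.fill((parallel - totalCount % parallel).toInt)(totalCount / parallel)

    // A running sum of the batch sizes yields each partition's start offset.
    val startOffsets = batchSizes.scanLeft(0L)(_ + _).init

    // Prints: Offset(start=0, size=4), Offset(start=4, size=3), Offset(start=7, size=3)
    startOffsets.zip(batchSizes).foreach { case (start, size) =>
      println(s"Offset(start=$start, size=$size)")
    }
  }
}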
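
CustomReader in FileBaseReader.scala above is abstract and expects the caller to supply the line-to-Row transformation, a row filter, and the target schema. The sketch below shows one hypothetical way to wire it up for a pipe-delimited text file; the class name, column names and delimiter are assumptions for illustration, not part of this patch.

package com.vesoft.nebula.exchange.reader

import com.vesoft.exchange.common.config.FileBaseSourceConfigEntry
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Reads lines such as "Tom|Bob" into a two-column DataFrame (src, dst).
class PipeDelimitedReader(override val session: SparkSession,
                          customConfig: FileBaseSourceConfigEntry)
    extends CustomReader(
      session,
      customConfig,
      // transformation: split the line and pad to two columns
      line => {
        val parts = line.split('|')
        Row(parts(0), if (parts.length > 1) parts(1) else "")
      },
      // filter: drop rows without a source id
      row => row.getString(0).nonEmpty,
      StructType(
        Seq(StructField("src", StringType, nullable = true),
            StructField("dst", StringType, nullable = true)))
    )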
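
KafkaReader above extracts the target fields from the Kafka message value with json_tuple, so every extracted column comes back as a string. The following is a small batch-mode sketch of that extraction on a literal JSON value; the sample field names and the local SparkSession are assumptions used only to make the snippet runnable.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.json_tuple

object JsonTupleSketch {
  def main(args: Array[String]): Unit = {
    val spark =
      SparkSession.builder().master("local[1]").appName("json-tuple-sketch").getOrCreate()
    import spark.implicits._

    val fields = List("src", "dst", "degree")
    val values = Seq("""{"src":"1","dst":"2","degree":"7.5"}""").toDF("value")

    // Same shape of extraction KafkaReader applies to the streaming `value` column:
    // json_tuple returns one string column per requested field, renamed via toDF.
    values
      .select(json_tuple($"value", fields: _*))
      .toDF(fields: _*)
      .show(false)

    spark.stop()
  }
}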
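
KafkaReader and PulsarReader return streaming DataFrames, which only do work once a streaming sink is attached. Below is a hedged sketch, not part of the patch, of how such a DataFrame could be drained with foreachBatch; the processing body and trigger interval are assumptions.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{StreamingQuery, Trigger}

object StreamingSinkSketch {
  // `df` is expected to be the DataFrame returned by KafkaReader.read()
  // or PulsarReader.read().
  def start(df: DataFrame): StreamingQuery = {
    // Declaring the handler as a typed function value keeps the Scala
    // overload of foreachBatch unambiguous under Scala 2.12.
    val handleBatch: (DataFrame, Long) => Unit = (batch, batchId) => {
      // Each micro-batch is a plain DataFrame whose columns are the JSON
      // fields extracted by json_tuple in KafkaReader.
      batch.show(10, truncate = false)
    }

    df.writeStream
      .foreachBatch(handleBatch)
      .trigger(Trigger.ProcessingTime("30 seconds"))
      .start()
  }
}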