From 4accdfe58bf7d862000f9365f7e1e5e94ac90e82 Mon Sep 17 00:00:00 2001
From: awang12345
Date: Mon, 19 Aug 2024 13:35:05 +0800
Subject: [PATCH] feat: support read and write from hive datasource (#100)

* feat: support read and write from hive datasource

* feat: connect hive by meta store

* refactor: remove show dataFrame
---
 .../src/main/resources/application.conf       |  39 ++++++-
 .../com/vesoft/nebula/algorithm/Main.scala    |   6 +-
 .../nebula/algorithm/config/Configs.scala     | 100 +++++++++++++++---
 .../nebula/algorithm/config/SparkConfig.scala |  19 ++++
 .../nebula/algorithm/reader/DataReader.scala  |  28 +++++
 .../nebula/algorithm/reader/ReaderType.scala  |   5 +-
 .../nebula/algorithm/writer/AlgoWriter.scala  |  43 +++++++-
 .../nebula/algorithm/writer/WriterType.scala  |   5 +-
 8 files changed, 220 insertions(+), 25 deletions(-)

diff --git a/nebula-algorithm/src/main/resources/application.conf b/nebula-algorithm/src/main/resources/application.conf
index e01c0fe..0fec307 100644
--- a/nebula-algorithm/src/main/resources/application.conf
+++ b/nebula-algorithm/src/main/resources/application.conf
@@ -11,14 +11,47 @@
   }
 
   data: {
-    # data source. optional of nebula,nebula-ngql,csv,json
+    # data source, one of: nebula, nebula-ngql, csv, json, hive
     source: csv
-    # data sink, means the algorithm result will be write into this sink. optional of nebula,csv,text
+    # data sink, the algorithm result will be written into this sink, one of: nebula, csv, text, hive
     sink: csv
     # if your algorithm needs weight
     hasWeight: false
   }
 
+  # Hive related config
+  hive: {
+    #[Optional] needed when Spark and Hive are deployed on different clusters; reads and writes connect to Hive through this metastore
+    metaStoreUris: "thrift://hive-metastore-server:9083"
+    # algo's data source from hive
+    read: {
+      # Spark SQL that produces the source data
+      sql: "select column_1,column_2,column_3 from database_01.table_01 "
+      #[Optional] column of the sql result that maps to the graph source vertex id
+      srcId: "column_1"
+      #[Optional] column of the sql result that maps to the graph destination vertex id
+      dstId: "column_2"
+      #[Optional] column of the sql result that maps to the edge weight
+      weight: "column_3"
+    }
+
+    # algo result sink into hive
+    write: {
+      # hive table that the algorithm result is saved into
+      dbTableName: "database_02.table_02"
+      #[Optional] spark dataframe save mode, one of: Append, Overwrite, ErrorIfExists, Ignore. Default is Append
+      saveMode: "Overwrite"
+      #[Optional] whether to create the hive table automatically. Default is true
+      autoCreateTable: true
+      #[Optional] mapping from algorithm result columns to hive table column names. Defaults to the column names of the result dataframe
+      resultTableColumnMapping: {
+        # Note: different algorithms produce different output fields; pagerank is used as an example here:
+        _id: "column_1"
+        pagerank: "pagerank_value"
+      }
+    }
+  }
+
   # NebulaGraph related config
   nebula: {
     # algo's data source from Nebula. If data.source is nebula, then this nebula.read config can be valid.
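The hive block above amounts to a plain Spark SQL read followed by a column projection. A minimal sketch of the reader side, assuming a Spark session with Hive support; the metastore URI, database, table and column names are the sample placeholders from the config, not real endpoints:

    import org.apache.spark.sql.SparkSession

    // Hypothetical sketch only; values mirror the sample hive.read block above.
    val spark = SparkSession.builder()
      .appName("hive-read-sketch")
      .config("hive.metastore.uris", "thrift://hive-metastore-server:9083")
      .enableHiveSupport()
      .getOrCreate()

    // Run the configured SQL, then keep the columns mapped to srcId, dstId and weight;
    // this projected edge list is what the algorithm receives.
    val edges = spark
      .sql("select column_1, column_2, column_3 from database_01.table_01")
      .select("column_1", "column_2", "column_3")
    edges.printSchema()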
@@ -78,7 +111,7 @@
   # the algorithm that you are going to execute,pick one from [pagerank, louvain, connectedcomponent,
   # labelpropagation, shortestpaths, degreestatic, kcore, stronglyconnectedcomponent, trianglecount,
   # betweenness, graphtriangleCount, clusteringcoefficient, bfs, hanp, closeness, jaccard, node2vec]
-  executeAlgo: graphtrianglecount
+  executeAlgo: pagerank
 
   # PageRank parameter
   pagerank: {
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/Main.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/Main.scala
index 0b2fc54..974f88b 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/Main.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/Main.scala
@@ -55,7 +55,7 @@ object Main {
     val algoTime = System.currentTimeMillis()
 
     // writer
-    saveAlgoResult(algoResult, configs)
+    saveAlgoResult(sparkConfig.spark, algoResult, configs)
     val endTime = System.currentTimeMillis()
 
     sparkConfig.spark.stop()
@@ -149,8 +149,8 @@ object Main {
     }
   }
 
-  private[this] def saveAlgoResult(algoResult: DataFrame, configs: Configs): Unit = {
+  private[this] def saveAlgoResult(spark: SparkSession, algoResult: DataFrame, configs: Configs): Unit = {
     val writer = AlgoWriter.make(configs)
-    writer.write(algoResult, configs)
+    writer.write(spark, algoResult, configs)
   }
 }
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/Configs.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/Configs.scala
index dc906d6..6222bc6 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/Configs.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/Configs.scala
@@ -12,6 +12,7 @@ import org.apache.log4j.Logger
 
 import scala.collection.JavaConverters._
 import com.typesafe.config.{Config, ConfigFactory}
 import com.vesoft.nebula.algorithm.config.Configs.readConfig
+import com.vesoft.nebula.algorithm.config.Configs.getOrElse
 
 import scala.collection.mutable
@@ -129,6 +130,51 @@
   }
 }
 
+
+object HiveConfigEntry {
+  def apply(config: Config): HiveConfigEntry = {
+    //uri of hive metastore. eg: thrift://127.0.0.1:9083
+    val hiveMetaStoreUris: String = getOrElse(config, "hive.metaStoreUris", "")
+    val readConfigEntry = buildReadConfig(config)
+    val writeConfigEntry = buildWriteConfig(config)
+    HiveConfigEntry(hiveMetaStoreUris, readConfigEntry, writeConfigEntry)
+  }
+
+  def buildReadConfig(config: Config): HiveReadConfigEntry = {
+    // Spark SQL that produces the source data
+    val sql: String = getOrElse(config, "hive.read.sql", "")
+    // column of the SQL result that maps to the source vertex id
+    val srcIdCol: String = getOrElse(config, "hive.read.srcId", "")
+    // column of the SQL result that maps to the destination vertex id
+    val dstIdCol: String = getOrElse(config, "hive.read.dstId", "")
+    // column of the SQL result that maps to the edge weight
+    val weightCol: String = getOrElse(config, "hive.read.weight", "")
+    HiveReadConfigEntry(sql, srcIdCol, dstIdCol, weightCol)
+  }
+
+  def buildWriteConfig(config: Config): HiveWriteConfigEntry = {
+    // hive table that the algo result is saved into
+    val dbTableName: String = getOrElse(config, "hive.write.dbTableName", "")
+    // save mode of spark
+    val saveMode: String = getOrElse(config, "hive.write.saveMode", "")
+    // whether the hive table is created automatically
+    val autoCreateTable: Boolean = getOrElse(config, "hive.write.autoCreateTable", true)
+    // mapping from algo result dataframe columns to hive table columns
+    val resultColumnMapping = mutable.Map[String, String]()
+    val mappingKey = "hive.write.resultTableColumnMapping"
+    if (config.hasPath(mappingKey)) {
+      val mappingConfig = config.getObject(mappingKey)
+      for (subkey <- mappingConfig.unwrapped().keySet().asScala) {
+        val key = s"${mappingKey}.${subkey}"
+        val value = config.getString(key)
+        resultColumnMapping += subkey -> value
+      }
+    }
+    HiveWriteConfigEntry(dbTableName, saveMode, autoCreateTable, resultColumnMapping)
+  }
+
+}
+
 /**
   * SparkConfigEntry support key-value pairs for spark session.
   *
@@ -173,6 +219,34 @@ case class LocalConfigEntry(filePath: String,
   }
 }
 
+case class HiveConfigEntry(hiveMetaStoreUris: String,
+                           hiveReadConfigEntry: HiveReadConfigEntry,
+                           hiveWriteConfigEntry: HiveWriteConfigEntry) {
+  override def toString: String = {
+    s"HiveConfigEntry: {hiveMetaStoreUris:$hiveMetaStoreUris, read: $hiveReadConfigEntry, write: $hiveWriteConfigEntry}"
+  }
+}
+
+case class HiveReadConfigEntry(sql: String,
+                               srcIdCol: String = "srcId",
+                               dstIdCol: String = "dstId",
+                               weightCol: String) {
+  override def toString: String = {
+    s"HiveReadConfigEntry: {sql: $sql, srcIdCol: $srcIdCol, dstIdCol: $dstIdCol, " +
+      s"weightCol:$weightCol}"
+  }
+}
+
+case class HiveWriteConfigEntry(dbTableName: String,
+                                saveMode: String,
+                                autoCreateTable: Boolean,
+                                resultColumnMapping: mutable.Map[String, String]) {
+  override def toString: String = {
+    s"HiveWriteConfigEntry: {dbTableName: $dbTableName, saveMode=$saveMode, " +
+      s"autoCreateTable=$autoCreateTable, resultColumnMapping=$resultColumnMapping}"
+  }
+}
+
 /**
   * NebulaConfigEntry
   * @param readConfigEntry config for nebula-spark-connector reader
@@ -218,6 +292,7 @@ case class Configs(sparkConfig: SparkConfigEntry,
                    dataSourceSinkEntry: DataSourceSinkEntry,
                    nebulaConfig: NebulaConfigEntry,
                    localConfigEntry: LocalConfigEntry,
+                   hiveConfigEntry: HiveConfigEntry,
                    algorithmConfig: AlgorithmConfigEntry)
 
 object Configs {
@@ -237,10 +312,11 @@
     val dataSourceEntry = DataSourceSinkEntry(config)
     val localConfigEntry = LocalConfigEntry(config)
     val nebulaConfigEntry = NebulaConfigEntry(config)
-    val sparkEntry = SparkConfigEntry(config)
-    val algorithmEntry = AlgorithmConfigEntry(config)
+    val hiveConfigEntry = HiveConfigEntry(config)
+    val sparkEntry = SparkConfigEntry(config)
+    val algorithmEntry = AlgorithmConfigEntry(config)
 
-    Configs(sparkEntry, dataSourceEntry, nebulaConfigEntry, localConfigEntry, algorithmEntry)
+    Configs(sparkEntry, dataSourceEntry, nebulaConfigEntry, localConfigEntry, hiveConfigEntry, algorithmEntry)
   }
 
   /**
@@ -277,15 +353,15 @@
   }
 
   /**
-    * Get the value from config by the path. If the path not exist, return the default value.
-    *
-    * @param config The config.
-    * @param path The path of the config.
-    * @param defaultValue The default value for the path.
-    *
-    * @return
-    */
-  private[this] def getOrElse[T](config: Config, path: String, defaultValue: T): T = {
+   * Get the value from config by the path. If the path does not exist, return the default value.
+   *
+   * @param config The config.
+   * @param path The path of the config.
+   * @param defaultValue The default value for the path.
+   *
+   * @return
+   */
+  def getOrElse[T](config: Config, path: String, defaultValue: T): T = {
     if (config.hasPath(path)) {
       config.getAnyRef(path).asInstanceOf[T]
     } else {
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/SparkConfig.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/SparkConfig.scala
index 7c863be..86c68b4 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/SparkConfig.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/SparkConfig.scala
@@ -5,6 +5,8 @@
 
 package com.vesoft.nebula.algorithm.config
 
+import com.vesoft.nebula.algorithm.reader.ReaderType
+import com.vesoft.nebula.algorithm.writer.WriterType
 import org.apache.spark.sql.SparkSession
 
 case class SparkConfig(spark: SparkSession, partitionNum: Int)
@@ -20,12 +22,29 @@ object SparkConfig {
     sparkConfigs.foreach { case (key, value) =>
       session.config(key, value)
     }
+
+    // set hive config
+    setHiveConfig(session, configs)
+
     val partitionNum = sparkConfigs.getOrElse("spark.app.partitionNum", "0")
     val spark = session.getOrCreate()
     validate(spark.version, "2.4.*")
     SparkConfig(spark, partitionNum.toInt)
   }
 
+  private def setHiveConfig(session: org.apache.spark.sql.SparkSession.Builder, configs: Configs): Unit = {
+    val dataSource = configs.dataSourceSinkEntry
+    if (dataSource.source.equals(ReaderType.hive.stringify)
+        || dataSource.sink.equals(WriterType.hive.stringify)) {
+      session.enableHiveSupport()
+      val uris = configs.hiveConfigEntry.hiveMetaStoreUris
+      if (uris != null && uris.trim.nonEmpty) {
+        session.config("hive.metastore.schema.verification", false)
+        session.config("hive.metastore.uris", uris)
+      }
+    }
+  }
+
   private def validate(sparkVersion: String, supportedVersions: String*): Unit = {
     if (sparkVersion != "UNKNOWN" && !supportedVersions.exists(sparkVersion.matches)) {
       throw new RuntimeException(
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/reader/DataReader.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/reader/DataReader.scala
index e11d868..872e834 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/reader/DataReader.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/reader/DataReader.scala
@@ -25,6 +25,7 @@ object DataReader {
        case ReaderType.nebulaNgql => new NebulaNgqlReader
        case ReaderType.nebula => new NebulaReader
        case ReaderType.csv => new CsvReader
+       case ReaderType.hive => new HiveReader
      }
       .getOrElse(throw new UnsupportedOperationException("unsupported reader"))
   }
@@ -179,3 +180,30 @@
     data
   }
 }
+final class HiveReader extends DataReader {
+
+  override val tpe: ReaderType = ReaderType.hive
+  override def read(spark: SparkSession, configs: Configs, partitionNum: Int): DataFrame = {
+    val readConfig = configs.hiveConfigEntry.hiveReadConfigEntry
+    val sql = readConfig.sql
+    val srcIdCol = readConfig.srcIdCol
+    val dstIdCol = readConfig.dstIdCol
+    val weightCol = readConfig.weightCol
+
+    var data = spark.sql(sql)
+
+    if (srcIdCol != null && dstIdCol != null && srcIdCol.trim.nonEmpty && dstIdCol.trim.nonEmpty) {
+      if (configs.dataSourceSinkEntry.hasWeight && weightCol != null && weightCol.trim.nonEmpty) {
+        data = data.select(srcIdCol, dstIdCol, weightCol)
+      } else {
+        data = data.select(srcIdCol, dstIdCol)
+      }
+    }
+
+    if (partitionNum != 0) {
+      data = data.repartition(partitionNum)
+    }
+
+    data
+  }
+}
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/reader/ReaderType.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/reader/ReaderType.scala
index ca1d101..12fc054 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/reader/ReaderType.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/reader/ReaderType.scala
@@ -17,6 +17,7 @@ sealed trait ReaderType {
     case ReaderType.nebulaNgql => "nebula-ngql"
     case ReaderType.nebula => "nebula"
     case ReaderType.csv => "csv"
+    case ReaderType.hive => "hive"
   }
 }
 object ReaderType {
@@ -24,10 +25,12 @@
     json.stringify -> json,
     nebulaNgql.stringify -> nebulaNgql,
     nebula.stringify -> nebula,
-    csv.stringify -> csv
+    csv.stringify -> csv,
+    hive.stringify -> hive
   )
   object json extends ReaderType
   object nebulaNgql extends ReaderType
   object nebula extends ReaderType
   object csv extends ReaderType
+  object hive extends ReaderType
 }
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/writer/AlgoWriter.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/writer/AlgoWriter.scala
index e4da34d..3fcb8ce 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/writer/AlgoWriter.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/writer/AlgoWriter.scala
@@ -8,11 +8,11 @@ package com.vesoft.nebula.algorithm.writer
 import com.vesoft.nebula.connector.connector.NebulaDataFrameWriter
 import com.vesoft.nebula.connector.{NebulaConnectionConfig, WriteMode, WriteNebulaVertexConfig}
 import com.vesoft.nebula.algorithm.config.{AlgoConstants, Configs}
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
 
 abstract class AlgoWriter {
   val tpe:WriterType
-  def write(data: DataFrame, configs: Configs): Unit
+  def write(spark: SparkSession, data: DataFrame, configs: Configs): Unit
 }
 object AlgoWriter {
   def make(configs: Configs): AlgoWriter = {
@@ -20,6 +20,7 @@ object AlgoWriter {
       case WriterType.text => new TextWriter
       case WriterType.nebula => new NebulaWriter
       case WriterType.csv => new CsvWriter
+      case WriterType.hive => new HiveWriter
     }.getOrElse(throw new UnsupportedOperationException("unsupported writer"))
   }
 }
@@ -27,7 +28,7 @@ final class NebulaWriter extends AlgoWriter {
   override val tpe: WriterType = WriterType.nebula
-  override def write(data: DataFrame, configs: Configs): Unit = {
+  override def write(spark: SparkSession, data: DataFrame, configs: Configs): Unit = {
     val graphAddress = configs.nebulaConfig.writeConfigEntry.graphAddress
     val metaAddress = configs.nebulaConfig.writeConfigEntry.metaAddress
     val space = configs.nebulaConfig.writeConfigEntry.space
@@ -61,7 +62,7 @@ final class CsvWriter extends AlgoWriter {
   override val tpe: WriterType = WriterType.csv
-  override def write(data: DataFrame, configs: Configs): Unit = {
+  override def write(spark: SparkSession, data: DataFrame, configs: Configs): Unit = {
     val resultPath = configs.localConfigEntry.resultPath
     data.write.option("header", true).csv(resultPath)
   }
 }
@@ -69,8 +70,40 @@ final class TextWriter extends AlgoWriter {
   override val tpe: WriterType = WriterType.text
-  override def write(data: DataFrame, configs: Configs): Unit = {
+  override def write(spark: SparkSession, data: DataFrame, configs: Configs): Unit = {
     val resultPath = configs.localConfigEntry.resultPath
     data.write.option("header", true).text(resultPath)
   }
 }
+
+final class HiveWriter extends AlgoWriter {
+  override val tpe: WriterType = WriterType.hive
+  override def write(spark: SparkSession, data: DataFrame, configs: Configs): Unit = {
+    val config = configs.hiveConfigEntry.hiveWriteConfigEntry
+    val saveMode = SaveMode.values().find(_.name.equalsIgnoreCase(config.saveMode)).getOrElse(SaveMode.Append)
+    val columnMapping = config.resultColumnMapping
+
+    var _data = data
+    columnMapping.foreach {
+      case (from, to) =>
+        _data = _data.withColumnRenamed(from, to)
+    }
+
+    if (config.autoCreateTable) {
+      val createTableStatement = generateCreateTableStatement(_data, config.dbTableName)
+      println(s"execute create hive table statement:${createTableStatement}")
+      spark.sql(createTableStatement)
+    }
+
+    _data.write.mode(saveMode).insertInto(config.dbTableName)
+  }
+
+  def generateCreateTableStatement(df: DataFrame, tableName: String): String = {
+    val columns = df.schema.fields
+    val columnDefinitions = columns.map { field =>
+      s"${field.name} ${field.dataType.typeName}"
+    }.mkString(",\n ")
+    s"CREATE TABLE IF NOT EXISTS $tableName (\n $columnDefinitions\n)"
+  }
+
+}
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/writer/WriterType.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/writer/WriterType.scala
index 84a7839..1a81497 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/writer/WriterType.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/writer/WriterType.scala
@@ -16,15 +16,18 @@ sealed trait WriterType {
     case WriterType.text => "text"
     case WriterType.nebula => "nebula"
     case WriterType.csv => "csv"
+    case WriterType.hive => "hive"
   }
 }
 object WriterType {
   lazy val mapping: Map[String, WriterType] = Map(
     text.stringify -> text,
     nebula.stringify -> nebula,
-    csv.stringify -> csv
+    csv.stringify -> csv,
+    hive.stringify -> hive
   )
   object text extends WriterType
   object nebula extends WriterType
   object csv extends WriterType
+  object hive extends WriterType
 }
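On the write side, HiveWriter renames the result columns according to resultTableColumnMapping, optionally derives a CREATE TABLE statement from the DataFrame schema, and then calls insertInto, which matches columns by position rather than by name. A minimal standalone sketch of that rename plus DDL derivation, assuming a local session and the pagerank example from the sample config (database_02.table_02, column_1 and pagerank_value are placeholder names):

    import org.apache.spark.sql.SparkSession

    // Hypothetical sketch; it mirrors HiveWriter's rename and DDL generation on a tiny
    // in-memory DataFrame instead of a real algorithm result.
    val spark = SparkSession.builder().master("local[*]").appName("hive-write-sketch").getOrCreate()
    import spark.implicits._

    // Stand-in for a pagerank result: one row per vertex id with its score.
    val result = Seq(("1", 0.85), ("2", 1.23)).toDF("_id", "pagerank")

    // Apply the resultTableColumnMapping from the sample config.
    val renamed = result
      .withColumnRenamed("_id", "column_1")
      .withColumnRenamed("pagerank", "pagerank_value")

    // Same derivation as generateCreateTableStatement: "<column name> <spark type name>" per field.
    val columnDefinitions = renamed.schema.fields
      .map(field => s"${field.name} ${field.dataType.typeName}")
      .mkString(",\n ")
    println(s"CREATE TABLE IF NOT EXISTS database_02.table_02 (\n $columnDefinitions\n)")
    // Prints:
    // CREATE TABLE IF NOT EXISTS database_02.table_02 (
    //  column_1 string,
    //  pagerank_value double
    // )

Because insertInto is position based, the rename mainly matters when autoCreateTable is enabled, where the generated DDL uses the mapped column names.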