最近在评估nebula给公司产品做商业选型用,使用了下nebula-algorithm,发现一些提交spark错误问题。如下,希望能够给出解决方案。
参考步骤:Nebula Algorithm - Nebula Graph Database 手册
错误:
2022-05-31 17:40:16,459 WARN [main] util.Utils (Logging.scala:logWarning(66)) - Your hostname, bonelee-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
2022-05-31 17:40:16,461 WARN [main] util.Utils (Logging.scala:logWarning(66)) - Set SPARK_LOCAL_IP if you need to bind to another address
Exception in thread "main" java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)V
at org.apache.hadoop.conf.Configuration.set(Configuration.java:1357)
at org.apache.hadoop.conf.Configuration.set(Configuration.java:1338)
at org.apache.spark.deploy.SparkHadoopUtil$.org$apache$spark$deploy$SparkHadoopUtil$$appendS3AndSparkHadoopConfigurations(SparkHadoopUtil.scala:464)
at org.apache.spark.deploy.SparkHadoopUtil$.newConfiguration(SparkHadoopUtil.scala:436)
at org.apache.spark.deploy.SparkSubmit$$anonfun$2.apply(SparkSubmit.scala:323)
at org.apache.spark.deploy.SparkSubmit$$anonfun$2.apply(SparkSubmit.scala:323)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:323)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:784)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:930)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:939)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
提交方式:
spark-submit --master "local" \
--conf spark.app.name="g1" \
--conf spark.executor.extraLibraryPath=/home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
--conf spark.executor.extraClassPath=/home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
--driver-class-path /home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
--driver-library-path /home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
--class com.vesoft.nebula.algorithm.Main nebula-algorithm/target/nebula-algorithm-3.0.0.jar -p /home/bonelee/Desktop/nebula-algorithm/application.conf
版本说明
Linux ubuntu:1804 LTS
Scala 2.11.12 (Java HotSpot™ 64-Bit Server VM, Java 1.8.0_333).
Spark: 2.4.8-bin-without-hadoop
hadoop:3.2.3
nebula:3.0.0
nebula-algorithm:3.0.0
application配置:
{
# Spark relation config
spark: {
app: {
name: LPA
# spark.app.partitionNum
partitionNum:100
}
master:local
}
data: {
# data source. optional of nebula,csv,json
source: nebula
# data sink, means the algorithm result will be written into this sink. optional of nebula,csv,text
sink: csv
# if your algorithm needs weight
hasWeight: false
}
# Nebula Graph relation config
nebula: {
# algo's data source from Nebula. If data.source is nebula, then this nebula.read config can be valid.
read: {
# Nebula metad server address, multiple addresses are split by English comma
metaAddress: "127.0.0.1:9559"
# Nebula space
space: basketballplayer
# Nebula edge types, multiple labels means that data from multiple edges will union together
labels: ["serve"]
# Nebula edge property name for each edge type, this property will be as weight col for algorithm.
# Make sure the weightCols are corresponding to labels.
weightCols: ["start_year"]
}
# algo result sink into Nebula. If data.sink is nebula, then this nebula.write config can be valid.
write:{
# Nebula graphd server address, multiple addresses are split by English comma
graphAddress: "127.0.0.1:9669"
# Nebula metad server address, multiple addresses are split by English comma
metaAddress: "127.0.0.1:9559"
user:root
pswd:nebula
# Nebula space name
space:nb
# Nebula tag name, the algorithm result will be written into this tag
tag:pagerank
# algorithm result is insert into new tag or update to original tag. type: insert/update
type:insert
}
}
local: {
# algo's data source from Nebula. If data.source is csv or json, then this local.read can be valid.
read:{
filePath: "file:///tmp/algo_edge.csv"
srcId:"src"
# dstId column
dstId:"dst"
# weight column
weight: "weight"
# if csv file has header
header: true
# csv file's delimiter
delimiter:","
}
# algo result sink into local file. If data.sink is csv or text, then this local.write can be valid.
write:{
resultPath:/tmp/count
}
}
algorithm: {
# the algorithm that you are going to execute,pick one from [pagerank, louvain, connectedcomponent,
# labelpropagation, shortestpaths, degreestatic, kcore, stronglyconnectedcomponent, trianglecount,
# betweenness, graphtriangleCount, clusteringcoefficient, bfs, hanp, closeness, jaccard, node2vec]
executeAlgo: graphtrianglecount
# PageRank parameter
pagerank: {
maxIter: 10
resetProb: 0.15 # default 0.15
}
# Louvain parameter
louvain: {
maxIter: 20
internalIter: 10
tol: 0.5
}
# connected component parameter.
connectedcomponent: {
maxIter: 20
}
# LabelPropagation parameter
labelpropagation: {
maxIter: 20
}
# ShortestPaths parameter
shortestpaths: {
# several vertices to compute the shortest path to all vertices.
landmarks: "1"
}
# Vertex degree statistics parameter
degreestatic: {}
# KCore parameter
kcore:{
maxIter:10
degree:1
}
# Trianglecount parameter
trianglecount:{}
# graphTriangleCount parameter
graphtrianglecount:{}
# Betweenness centrality parameter. maxIter parameter means the max times of iterations.
betweenness:{
maxIter:5
}
# Clustering Coefficient parameter. The type parameter has two choice, local or global
# local type will compute the clustering coefficient for each vertex, and print the average coefficient for graph.
# global type just compute the graph's clustering coefficient.
clusteringcoefficient:{
type: local
}
# ClosenessAlgo parameter
closeness:{}
# BFS parameter
bfs:{
maxIter:5
root:"10"
}
# HanpAlgo parameter
hanp:{
hopAttenuation:0.1
maxIter:10
preference:1.0
}
#Node2vecAlgo parameter
node2vec:{
maxIter: 10,
lr: 0.025,
dataNumPartition: 10,
modelNumPartition: 10,
dim: 10,
window: 3,
walkLength: 5,
numWalks: 3,
p: 1.0,
q: 1.0,
directed: false,
degree: 30,
embSeparate: ",",
modelPath: "hdfs://127.0.0.1:9000/model"
}
# JaccardAlgo parameter
jaccard:{
tol: 1.0
}
}
}
怀疑是guava版本问题:报错的重载 Preconditions.checkArgument(boolean, String, Object) 是 Guava 20 才引入的,而 Hadoop 3.2.x 依赖更高版本的 Guava(3.x 系列已升到 27);用 --driver-class-path 强行前置的 guava-14.0.jar 没有该方法,于是 Hadoop 的 Configuration.set 抛出 NoSuchMethodError。可行的方向是:不要把 guava-14.0.jar 放到 driver classpath 上,而是改用 spark.executor.userClassPathFirst/shade 的方式隔离 nebula 客户端所需的低版本 Guava,或换用自带 Hadoop 的 Spark 发行版以避免 Guava 版本冲突。
如果将上述数据源nebula修改为csv,简单运行:
/home/bonelee/spark-2.4.8-bin-without-hadoop/bin/spark-submit --master "local" \
--class com.vesoft.nebula.algorithm.Main nebula-algorithm/target/nebula-algorithm-3.0.0.jar -p /home/bonelee/Desktop/nebula-algorithm/application.conf
可得出正确结果。
但这肯定不是我想要的,我希望数据源依然是nebula。