提问参考模版:
- nebula 版本:2.0.1
- 部署方式(分布式 RPM):3 meta、6 storage、6 graph
- 是否为线上版本:Y / N
- 硬件信息
- 磁盘ssd
- CPU:32C;内存:64G
- 问题的具体描述
同样数据量用 NebulaGraph 2.0.0 的 Spark 导入数据,比 1.0 慢一倍,请问是什么原因?还是 2.0 本身就比 1.0 慢?
数据量:1亿节点,5亿边
1.0 spark 导入1.77h
2.0 spark 导入 3.19h
相关配置如下
##--queue root.ogp-db.import \
${SPARK_HOME}/bin/spark-submit \
--queue root.ipd.daily \
--name "nebula2.0-import-$taskName" \
--master yarn \
--driver-cores 26 \
--driver-memory 32g \
--executor-memory 32g \
--deploy-mode cluster \
--num-executors 48 \
--executor-cores 20 \
--conf spark.port.maxRetries=3 \
--conf spark.yarn.maxAppAttempts=3 \
--conf spark.executor.memoryOverhead=8g \
--conf spark.driver.memoryOverhead=8g \
--conf spark.hadoop.fs.defaultFS="$ALG_HDFS" \
--conf spark.default.parallelism=48 \
--conf spark.executor.extraJavaOptions="-XX:MaxDirectMemorySize=7372m" \
--files "$nebulaConf" \
--class com.vesoft.nebula.exchange.Exchange \
${baseJarPath}/nebula-exchange-2.0.0.jar -c nebula-import.conf -h -d
}
{
# Spark 相关信息配置
# 参见: http://spark.apache.org/docs/latest/configuration.html
spark: {
app: {
name: Spark Writer
}
driver: {
cores: 16
maxResultSize: 16G
}
cores {
max: 16
}
}
# Nebula Graph 相关信息配置
nebula: {
# 查询引擎 IP 列表
address: {
graph: ["{{ graph | join('", "') }}"]
meta: ["{{ meta | join('", "') }}"]
}
# 连接 Nebula Graph 服务的用户名和密码
user: {{ user }}
pswd: {{ password }}
# Nebula Graph 图空间名称
space: {{ space }}
# thrift 超时时长及重试次数
# 如未设置,则默认值分别为 3000 和 3
connection {
timeout: 10000000
retry: 10
}
# nGQL 查询重试次数
# 如未设置,则默认值为 3
execution {
retry: 10
}
error: {
max: 32
output: /tmp/errors
}
rate: {
limit: 1024
timeout: 10000000
}
}
# 处理标签
tags: [
# 与上述类似
# 从 Hive 加载时,将执行 ${exec} 中的命令作为数据集
{% for vertex in vertex_list %}{
name: {{ vertex['name'] }}
type: {
source: hive
sink: client
}
exec: "select {{ (([vertex['id_name']] | list) + vertex['hive_fields']) | join(', ') }} from {{ vertex['table'] }} where ds = '{{ vertex['ds'] }}'"
fields: [{{ vertex['hive_fields'] | join(', ') }}]
nebula.fields: [{{ vertex['nebula_fields'] | join(', ') }}]
vertex: {{ vertex['id_name'] }}
isImplicit: true
batch: 384
partition: 48
}
{% endfor %}
]
# 处理边
edges: [
# 从 Hive 加载时,将执行 ${exec} 中的命令作为数据集
# 边权重为可选
{% for edge in edge_list %}{
name: {{ edge['name'] }}
type: {
source: hive
sink: client
}
exec: "select {{ (([edge['source_id'], edge['target_id']] | list) + edge['hive_fields']) | join(', ') }} from {{ edge['table'] }} where ds = '{{ edge['ds'] }}' and edge_label = '{{ edge['name'] }}'"
fields: [{{ edge['hive_fields'] | join(', ') }}]
nebula.fields: [{{ edge['nebula_fields'] | join(', ') }}]
source: {{ edge['source_id'] }}
target: {{ edge['target_id'] }}{% if edge['rank'] %}
ranking: {{ edge['rank'] }}{% endif %}
isImplicit: true
batch: 384
partition: 48
}
{% endfor %}
]
}