执行UPDATE VERTEX ... YIELD 字段,导致storaged crash

执行以下语句会导致 storaged crash:

UPDATE  VERTEX ON `aa` 'a:1005'
	SET
		`name` = 'test'
YIELD 'a:1005' AS `aId`, `name` 这种语法

测试版本:docker-compose 3.1.0, 3.2.0, 3.3.0, nightly, 2.6.6

其他说明:在 nightly 版本中,把 YIELD 子句里 `aId`、`name` 两侧的反引号 “`” 删除后,可以正常执行。

关联问题:使用UPDATE YIELD时: -1005:RPC failure in StorageClient: Channel got EOF

相关的数据量大概多大,还有机器配置也贴一下。

测试数据:

# Create the test space. The vertex ID type is a fixed-length string of 200.
CREATE SPACE `test` (partition_num = 20, replica_factor = 1, vid_type = FIXED_STRING(200));

# Select the space so the statements below have a space to run in.
# A newly created space may not be usable immediately; if USE reports that
# the space is not found, wait a few seconds and retry.
USE `test`;

# Tag with a single nullable string property.
CREATE tag `aa` (`name` string NULL  COMMENT "名称");

# Insert the vertex that the UPDATE below targets.
INSERT VERTEX  aa(name) VALUES 'a:1005':('test');

# Statement reported to crash storaged: the YIELD clause returns a string
# literal aliased as `aId` together with the `name` property.
UPDATE  VERTEX ON `aa` 'a:1005'
	SET
		`name` = 'test'
YIELD 'a:1005' AS `aId`, `name`;

studio 报错信息

-1005:RPC failure in StorageClient: Channel got EOF. Check for server hitting connection limit, server connection idle timeout, and server crashes.

docker-compose.yaml(以3.1.0为例子)

version: '3.4'
services:
  # Meta daemon replica 0 (vesoft/nebula-metad image). Started with
  # --port=9559 and --ws_http_port=19559; the healthcheck polls /status on
  # the latter.
  metad0:
    image: vesoft/nebula-metad:v3.1.0
    restart: on-failure
    networks:
      - nebula-net
    cap_add:
      # Grants the ptrace capability; presumably so a debugger can attach.
      - SYS_PTRACE
    environment:
      USER: root
      # Substituted by Compose from the TZ variable of the invoking environment.
      TZ: "${TZ}"
    command:
      - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
      - --local_ip=metad0
      - --ws_ip=metad0
      - --port=9559
      - --ws_http_port=19559
      - --data_path=/data/meta
      - --log_dir=/logs
      - --v=0
      - --minloglevel=0
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://metad0:19559/status'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      # Container ports only: Compose picks the host ports.
      - "9559"
      - "19559"
      - "19560"
    volumes:
      - ./data/meta0:/data/meta
      - ./logs/meta0:/logs

  # Meta daemon replica 1 (vesoft/nebula-metad image). Started with
  # --port=9559 and --ws_http_port=19559; the healthcheck polls /status on
  # the latter.
  metad1:
    image: vesoft/nebula-metad:v3.1.0
    restart: on-failure
    networks:
      - nebula-net
    cap_add:
      # Grants the ptrace capability; presumably so a debugger can attach.
      - SYS_PTRACE
    environment:
      USER: root
      # Substituted by Compose from the TZ variable of the invoking environment.
      TZ: "${TZ}"
    command:
      - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
      - --local_ip=metad1
      - --ws_ip=metad1
      - --port=9559
      - --ws_http_port=19559
      - --data_path=/data/meta
      - --log_dir=/logs
      - --v=0
      - --minloglevel=0
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://metad1:19559/status'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      # Container ports only: Compose picks the host ports.
      - "9559"
      - "19559"
      - "19560"
    volumes:
      - ./data/meta1:/data/meta
      - ./logs/meta1:/logs

  # Meta daemon replica 2 (vesoft/nebula-metad image). Started with
  # --port=9559 and --ws_http_port=19559; the healthcheck polls /status on
  # the latter.
  metad2:
    image: vesoft/nebula-metad:v3.1.0
    restart: on-failure
    networks:
      - nebula-net
    cap_add:
      # Grants the ptrace capability; presumably so a debugger can attach.
      - SYS_PTRACE
    environment:
      USER: root
      # Substituted by Compose from the TZ variable of the invoking environment.
      TZ: "${TZ}"
    command:
      - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
      - --local_ip=metad2
      - --ws_ip=metad2
      - --port=9559
      - --ws_http_port=19559
      - --data_path=/data/meta
      - --log_dir=/logs
      - --v=0
      - --minloglevel=0
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://metad2:19559/status'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      # Container ports only: Compose picks the host ports.
      - "9559"
      - "19559"
      - "19560"
    volumes:
      - ./data/meta2:/data/meta
      - ./logs/meta2:/logs

  # Storage daemon replica 0 (vesoft/nebula-storaged image). Started after the
  # three metad containers, with --port=9779 and --ws_http_port=19779; the
  # healthcheck polls /status on the latter.
  storaged0:
    image: vesoft/nebula-storaged:v3.1.0
    restart: on-failure
    depends_on:
      - metad0
      - metad1
      - metad2
    networks:
      - nebula-net
    cap_add:
      # Grants the ptrace capability; presumably so a debugger can attach.
      - SYS_PTRACE
    environment:
      USER: root
      # Substituted by Compose from the TZ variable of the invoking environment.
      TZ: "${TZ}"
    command:
      - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
      - --local_ip=storaged0
      - --ws_ip=storaged0
      - --port=9779
      - --ws_http_port=19779
      - --data_path=/data/storage
      - --log_dir=/logs
      - --v=0
      - --minloglevel=0
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://storaged0:19779/status'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      # Container ports only: Compose picks the host ports.
      - "9779"
      - "19779"
      - "19780"
    volumes:
      - ./data/storage0:/data/storage
      - ./logs/storage0:/logs

  # Storage daemon replica 1 (vesoft/nebula-storaged image). Started after the
  # three metad containers, with --port=9779 and --ws_http_port=19779; the
  # healthcheck polls /status on the latter.
  storaged1:
    image: vesoft/nebula-storaged:v3.1.0
    restart: on-failure
    depends_on:
      - metad0
      - metad1
      - metad2
    networks:
      - nebula-net
    cap_add:
      # Grants the ptrace capability; presumably so a debugger can attach.
      - SYS_PTRACE
    environment:
      USER: root
      # Substituted by Compose from the TZ variable of the invoking environment.
      TZ: "${TZ}"
    command:
      - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
      - --local_ip=storaged1
      - --ws_ip=storaged1
      - --port=9779
      - --ws_http_port=19779
      - --data_path=/data/storage
      - --log_dir=/logs
      - --v=0
      - --minloglevel=0
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://storaged1:19779/status'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      # Container ports only: Compose picks the host ports.
      - "9779"
      - "19779"
      - "19780"
    volumes:
      - ./data/storage1:/data/storage
      - ./logs/storage1:/logs

  # Storage daemon replica 2 (vesoft/nebula-storaged image). Started after the
  # three metad containers, with --port=9779 and --ws_http_port=19779; the
  # healthcheck polls /status on the latter.
  storaged2:
    image: vesoft/nebula-storaged:v3.1.0
    restart: on-failure
    depends_on:
      - metad0
      - metad1
      - metad2
    networks:
      - nebula-net
    cap_add:
      # Grants the ptrace capability; presumably so a debugger can attach.
      - SYS_PTRACE
    environment:
      USER: root
      # Substituted by Compose from the TZ variable of the invoking environment.
      TZ: "${TZ}"
    command:
      - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
      - --local_ip=storaged2
      - --ws_ip=storaged2
      - --port=9779
      - --ws_http_port=19779
      - --data_path=/data/storage
      - --log_dir=/logs
      - --v=0
      - --minloglevel=0
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://storaged2:19779/status'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      # Container ports only: Compose picks the host ports.
      - "9779"
      - "19779"
      - "19780"
    volumes:
      - ./data/storage2:/data/storage
      - ./logs/storage2:/logs

  # Graph (query) daemon, first instance (vesoft/nebula-graphd image). Started
  # after the three storaged containers. This is the only graphd whose client
  # port 9669 is bound to a fixed host port; the console service connects to it.
  graphd:
    image: vesoft/nebula-graphd:v3.1.0
    restart: on-failure
    depends_on:
      - storaged0
      - storaged1
      - storaged2
    networks:
      - nebula-net
    cap_add:
      # Grants the ptrace capability; presumably so a debugger can attach.
      - SYS_PTRACE
    environment:
      USER: root
      # Substituted by Compose from the TZ variable of the invoking environment.
      TZ: "${TZ}"
    command:
      - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
      - --port=9669
      - --local_ip=graphd
      - --ws_ip=graphd
      - --ws_http_port=19669
      - --log_dir=/logs
      - --v=0
      - --minloglevel=0
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://graphd:19669/status'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      # Fixed host:container mapping for the client port.
      - "9669:9669"
      # Container ports only: Compose picks the host ports.
      - "19669"
      - "19670"
    volumes:
      - ./logs/graph:/logs

  # Graph (query) daemon, second instance (vesoft/nebula-graphd image). Started
  # after the three storaged containers, with --port=9669 and
  # --ws_http_port=19669; the healthcheck polls /status on the latter.
  graphd1:
    image: vesoft/nebula-graphd:v3.1.0
    restart: on-failure
    depends_on:
      - storaged0
      - storaged1
      - storaged2
    networks:
      - nebula-net
    cap_add:
      # Grants the ptrace capability; presumably so a debugger can attach.
      - SYS_PTRACE
    environment:
      USER: root
      # Substituted by Compose from the TZ variable of the invoking environment.
      TZ: "${TZ}"
    command:
      - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
      - --port=9669
      - --local_ip=graphd1
      - --ws_ip=graphd1
      - --ws_http_port=19669
      - --log_dir=/logs
      - --v=0
      - --minloglevel=0
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://graphd1:19669/status'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      # Container ports only: Compose picks the host ports.
      - "9669"
      - "19669"
      - "19670"
    volumes:
      - ./logs/graph1:/logs

  # Graph (query) daemon, third instance (vesoft/nebula-graphd image). Started
  # after the three storaged containers, with --port=9669 and
  # --ws_http_port=19669; the healthcheck polls /status on the latter.
  graphd2:
    image: vesoft/nebula-graphd:v3.1.0
    restart: on-failure
    depends_on:
      - storaged0
      - storaged1
      - storaged2
    networks:
      - nebula-net
    cap_add:
      # Grants the ptrace capability; presumably so a debugger can attach.
      - SYS_PTRACE
    environment:
      USER: root
      # Substituted by Compose from the TZ variable of the invoking environment.
      TZ: "${TZ}"
    command:
      - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
      - --port=9669
      - --local_ip=graphd2
      - --ws_ip=graphd2
      - --ws_http_port=19669
      - --log_dir=/logs
      - --v=0
      - --minloglevel=0
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://graphd2:19669/status'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      # Container ports only: Compose picks the host ports.
      - "9669"
      - "19669"
      - "19670"
    volumes:
      - ./logs/graph2:/logs

  # Helper container: registers the three storaged hosts with the cluster by
  # running ADD HOSTS through graphd, retrying once per second for up to 60
  # attempts, then idles on `tail -f /dev/null` so the container stays up.
  console:
    image: vesoft/nebula-console:v3.0.0
    # Clears the image entrypoint so `command` below runs as given.
    entrypoint: ""
    platform: linux/amd64
    # The script runs under `sh -c`, so it uses the POSIX `[ ... -eq ... ]`
    # test instead of the bash-only `[[ ... == ... ]]`.
    # `$$` is the Compose escape for a literal `$`; the shell sees `$?`.
    command:
      - sh
      - -c
      - |
        for i in `seq 1 60`;do
          var=`nebula-console -addr graphd -port 9669 -u root -p nebula -e 'ADD HOSTS "storaged0":9779,"storaged1":9779,"storaged2":9779'`;
          if [ $$? -eq 0 ];then
            break;
          fi;
          sleep 1;
          echo "retry to add hosts.";
        done && tail -f /dev/null;

    depends_on:
      - graphd
    networks:
      - nebula-net
  # NebulaGraph Studio (vesoft/nebula-graph-studio image), started after
  # graphd and published on host port 7001.
  # NOTE(review): `entrypoint: ""` clears the image's entrypoint - confirm the
  # image still starts Studio through its default command.
  studio:
    image: vesoft/nebula-graph-studio:v3.3.0
    platform: linux/amd64
    entrypoint: ""
    depends_on:
      - graphd
    networks:
      - nebula-net
    ports:
      - '7001:7001'

# Network that the services attach to; declared with default settings.
networks:
  nebula-net: {}

除了2.6.2,其他替换docker版本即可

storaged crash 的 corefile 能帮忙捉一下吗,方法可以参考:linux core dump 文件 gdb分析 - ThinkDiff - 博客园

docker-compose up -d,docker-compose启动之后,然后跑一下测试的语句,这个问题就能复现了,复现不了么?

:thinking: benma 方便的话,可以抓下 core dump,可以节省下研发搭建类似环境的时间~ 谢谢啦。

(gdb) bt
#0  0x00007f7df351a079 in vfprintf () from /lib64/libc.so.6
#1  0x00007f7df3545179 in vsnprintf () from /lib64/libc.so.6
#2  0x000000000242ec73 in ?? ()
#3  0x00000000024300f7 in folly::stringVPrintf[abi:cxx11](char const*, __va_list_tag*) ()
#4  0x000000000243029f in folly::stringPrintf[abi:cxx11](char const*, ...) ()
#5  0x00000000011af3ba in nebula::storage::UpdateResNode<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >::doExecute(int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
#6  0x000000000119ef70 in nebula::storage::RelNode<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >::execute(int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
#7  0x000000000119ed26 in nebula::storage::RelNode<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >::doExecute(int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
#8  0x000000000119ef70 in nebula::storage::RelNode<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >::execute(int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
#9  0x000000000119dfe0 in nebula::storage::UpdateVertexProcessor::doProcess(nebula::storage::cpp2::UpdateVertexRequest const&) ()
#10 0x00000000020bbcb7 in virtual thunk to apache::thrift::concurrency::FunctionRunner::run() ()
#11 0x0000000002218768 in apache::thrift::concurrency::ThreadManager::Impl::Worker::run() ()
#12 0x000000000221a86e in apache::thrift::concurrency::PthreadThread::threadMain(void*) ()
#13 0x00007f7df38a2ea5 in start_thread () from /lib64/libpthread.so.0
#14 0x00007f7df35cbb0d in clone () from /lib64/libc.so.6
1 个赞

目测是vid长度问题

1 个赞

我们的研发同学在看了哈,有消息会更新帖子的哈。

想了解两个问题:

  1. 你的 vid 长度设置的是多少?
  2. 这个语句中的 'a:1005' AS aId 是笔误吗?还是你执行的语句就长这样?正常语法应该是 aa as aId?
  1. 上面的创建语句,vid写了200
  2. 不是笔误啊,'a:1005' 这个是具体 vid 的值,将字符串 'a:1005' 赋值给 aId

补充一下,如果vid长度是固定的30,可以执行成功,如图所示

了解,感谢,我们定位一下问题;

感谢,确定是个 bug,提了 issue:https://github.com/vesoft-inc/nebula/issues/4926 ,正在修

2 个赞