Help: Docker Swarm cluster deployment fails

Help: after modifying the docker-swarm file, deployment fails. Hoping for official support.
The docker-swarm file:


version: '3.4'
services:
  metad0:
    image: 192.168.1.161:80/common/nebula-metad:v1.0
    environment:
      USER: root
      TZ:   "${TZ}"
    command:
      - --meta_server_addrs=192.168.1.166:45500,192.168.1.167:45501,192.168.1.168:45502
      - --local_ip=192.168.1.166
      - --ws_ip=192.168.1.166
      - --port=45500
      - --data_path=/data/meta
      - --log_dir=/logs
      - --v=0
      - --minloglevel=2
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == KF2-DATA-166
    healthcheck:
      test: ["CMD", "curl", "-f", "http://192.168.1.166:11000/status"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s

    ports:
      - target: 11000
        published: 11000
        protocol: tcp
        mode: host
      - target: 11002
        published: 11002
        protocol: tcp
        mode: host
      - target: 45500
        published: 45500
        protocol: tcp
        mode: host

    volumes:
      - ./data/meta0:/data/meta
      - ./logs/meta0:/logs
    networks:
      - nebula-net

  metad1:
    image: 192.168.1.161:80/common/nebula-metad:v1.0
    environment:
      USER: root
      TZ:   "${TZ}"
    command:
      - --meta_server_addrs=192.168.1.166:45500,192.168.1.167:45501,192.168.1.168:45502
      - --local_ip=192.168.1.167
      - --ws_ip=192.168.1.167
      - --port=45500
      - --data_path=/data/meta
      - --log_dir=/logs
      - --v=0
      - --minloglevel=2
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == KF2-DATA-167
    healthcheck:
      test: ["CMD", "curl", "-f", "http://192.168.1.167:11001/status"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      - target: 11000
        published: 11001
        protocol: tcp
        mode: host
      - target: 11002
        published: 11002
        protocol: tcp
        mode: host
      - target: 45500
        published: 45501
        protocol: tcp
        mode: host
    volumes:
      - ./data/meta1:/data/meta
      - ./logs/meta1:/logs
    networks:
      - nebula-net

  metad2:
    image: 192.168.1.161:80/common/nebula-metad:v1.0
    environment:
      USER: root
      TZ:   "${TZ}"
    command:
      - --meta_server_addrs=192.168.1.166:45500,192.168.1.167:45501,192.168.1.168:45502
      - --local_ip=192.168.1.168
      - --ws_ip=192.168.1.168
      - --port=45500
      - --data_path=/data/meta
      - --log_dir=/logs
      - --v=0
      - --minloglevel=2
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == KF2-DATA-168
    healthcheck:
      test: ["CMD", "curl", "-f", "http://192.168.1.168:11003/status"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      - target: 11000
        published: 11003
        protocol: tcp
        mode: host
      - target: 11002
        published: 11004
        protocol: tcp
        mode: host
      - target: 45500
        published: 45502
        protocol: tcp
        mode: host
    volumes:
      - ./data/meta2:/data/meta
      - ./logs/meta2:/logs
    networks:
      - nebula-net

  storaged0:
    image: 192.168.1.161:80/common/nebula-storaged:v1.0
    environment:
      USER: root
      TZ:   "${TZ}"
    command:
      - --meta_server_addrs=192.168.1.166:45500,192.168.1.167:45501,192.168.1.168:45502
      - --local_ip=192.168.1.166
      - --ws_ip=192.168.1.166
      - --port=44500
      - --data_path=/data/storage
      - --log_dir=/logs
      - --v=0
      - --minloglevel=2
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == KF2-DATA-166
    depends_on:
      - metad0
      - metad1
      - metad2
    healthcheck:
      test: ["CMD", "curl", "-f", "http://192.168.1.166:12000/status"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      - target: 12000
        published: 12000
        protocol: tcp
        mode: host
      - target: 12002
        published: 12002
        protocol: tcp
        mode: host
    volumes:
      - ./data/storage0:/data/storage
      - ./logs/storage0:/logs
    networks:
      - nebula-net

  storaged1:
    image: 192.168.1.161:80/common/nebula-storaged:v1.0
    environment:
      USER: root
      TZ:   "${TZ}"
    command:
      - --meta_server_addrs=192.168.1.166:45500,192.168.1.167:45501,192.168.1.168:45502
      - --local_ip=192.168.1.167
      - --ws_ip=192.168.1.167
      - --port=44500
      - --data_path=/data/storage
      - --log_dir=/logs
      - --v=0
      - --minloglevel=2
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == KF2-DATA-167
    depends_on:
      - metad0
      - metad1
      - metad2
    healthcheck:
      test: ["CMD", "curl", "-f", "http://192.168.1.167:12003/status"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      - target: 12000
        published: 12003
        protocol: tcp
        mode: host
      - target: 12002
        published: 12004
        protocol: tcp
        mode: host
    volumes:
      - ./data/storage1:/data/storage
      - ./logs/storage1:/logs
    networks:
      - nebula-net

  storaged2:
    image: 192.168.1.161:80/common/nebula-storaged:v1.0
    environment:
      USER: root
      TZ:   "${TZ}"
    command:
      - --meta_server_addrs=192.168.1.166:45500,192.168.1.167:45501,192.168.1.168:45502
      - --local_ip=192.168.1.168
      - --ws_ip=192.168.1.168
      - --port=44500
      - --data_path=/data/storage
      - --log_dir=/logs
      - --v=0
      - --minloglevel=2
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == KF2-DATA-168
    depends_on:
      - metad0
      - metad1
      - metad2
    healthcheck:
      test: ["CMD", "curl", "-f", "http://192.168.1.168:12005/status"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      - target: 12000
        published: 12005
        protocol: tcp
        mode: host
      - target: 12002
        published: 12006
        protocol: tcp
        mode: host
    volumes:
      - ./data/storage2:/data/storage
      - ./logs/storage2:/logs
    networks:
      - nebula-net

  graphd:
    image: 192.168.1.161:80/common/nebula-graphd:v1.0
    environment:
      USER: root
      TZ:   "${TZ}"
    command:
      - --meta_server_addrs=192.168.1.166:45500,192.168.1.167:45501,192.168.1.168:45502
      - --port=3699
      - --ws_ip=192.168.1.166
      - --log_dir=/logs
      - --v=0
      - --minloglevel=2
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == KF2-DATA-166
    depends_on:
      - metad0
      - metad1
      - metad2
    healthcheck:
      test: ["CMD", "curl", "-f", "http://192.168.1.166:13000/status"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      - target: 3699
        published: 3699
        protocol: tcp
        mode: host
      - target: 13000
        published: 13000
        protocol: tcp
        mode: host
      - target: 13002
        published: 13002
        protocol: tcp
        mode: host
    volumes:
      - ./logs/graph:/logs
    networks:
      - nebula-net

networks:
  nebula-net:

[Problem screenshot]

The formatting is a bit off; feel free to contact me.
Email: henson_wu@foxmail.com
WeChat: wu88888888000

@dingding please help

Have any log files been generated under the log directory configured for the services?

@dingding Yes, there are log files. If you leave an email address I can zip them up and send them to you; attachments can't be uploaded here.
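
(As an aside, the logs can also be pulled from a swarm manager without logging into each node; a minimal sketch, assuming the stack was deployed under the name nebula, so the metad0 service is named nebula_metad0:)

    # list the service's tasks and their exit reasons, untruncated
    docker service ps nebula_metad0 --no-trunc
    # tail the last 100 log lines, aggregated across nodes
    docker service logs --tail 100 nebula_metad0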

The network mode of nebula-net needs to be set to host; see network_mode.
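
(For context, a minimal sketch of one way to do this in a stack file: attach the services to the pre-existing host network, declared as external. The name field assumes compose file version 3.5+; with host networking the ports: sections become unnecessary, because each daemon binds directly on its node.)

    networks:
      hostnet:
        external: true   # 'host' already exists; do not create it
        name: host

    services:
      metad0:
        # ... image/command/deploy as before ...
        networks:
          - hostnet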

@yee @dingding Hi both, I have now set the network mode to host.

storaged errors out as shown in the screenshot.
metad did not print any logs.

We will verify your configuration file locally and then get back to you.

Thanks.

Hello, I set up three test machines and reproduced the issue on them; I can send you the passwords.

Thanks for your help; we have already prepared machines. Later on we will provide a swarm configuration that users can use directly.

Thanks.

@henson

Sorry for replying only now.

The Docker Swarm deployment example has now been debugged successfully; see the docker-swarm branch of nebula-docker-compose for details.

The specific steps are as follows (they are collected into one runnable sketch after this list):

  1. Prepare the Docker Swarm cluster: initialize every node with docker swarm init/join. If you run into port problems later, check whether the firewall on each node is disabled.
  2. Give every node in the swarm cluster a hostname based on its IP (or any other hostname that tells the nodes apart).
  3. Clone the branch of the repository above: git clone --branch docker-swarm --single-branch --depth 1 https://github.com/vesoft-inc/nebula-docker-compose
  4. In docker-stack.yaml, change the IPs to your own node IPs, and change the node.hostname constraints to the hostnames of the corresponding nodes.
  5. Deploy with docker stack: docker stack deploy -c docker-stack.yaml nebula
  6. List all of nebula's services: docker stack services nebula
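
Collected into a single runnable sketch (the advertise address and the join token are placeholders for your own environment):

    # 1. on the first node: initialize the swarm
    docker swarm init --advertise-addr 192.168.1.166
    # 2. on every other node: join with the token printed by init
    docker swarm join --token <worker-token> 192.168.1.166:2377
    # 3. fetch only the docker-swarm branch of the repository
    git clone --branch docker-swarm --single-branch --depth 1 \
        https://github.com/vesoft-inc/nebula-docker-compose
    cd nebula-docker-compose
    # 4. edit docker-stack.yaml: replace the IPs and the node.hostname
    #    constraints with the values for your own nodes
    # 5. deploy the stack under the name 'nebula'
    docker stack deploy -c docker-stack.yaml nebula
    # 6. confirm every service reports its expected replica count
    docker stack services nebula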

The idea behind this is to deploy each container onto its corresponding node and use that node's network, so that Nebula can be configured with the host IPs directly. The key point is the network type.

If you run into any other problems during deployment, feel free to reply to us at any time.

Many thanks.

:blush: If you feel Yee has solved your problem, you can mark the corresponding reply as the solution.


Hello, I will give feedback once verification is complete.

Verification complete; thanks again.


Following that file, I added two more graphd services, but they failed to start.

  graphd2:
    image: 192.168.1.161:80/common/nebula-graphd:v1.0.0
    env_file:
      - ./nebula.env
    command:
      - --meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500
      - --port=3640
      - --ws_ip=192.168.1.167
      - --log_dir=/logs
      - --v=2
      - --minloglevel=2
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == KF2-DATA-167
    depends_on:
      - metad0
      - metad1
      - metad2
    healthcheck:
      test: ["CMD", "curl", "-f", "http://192.168.1.167:13001/status"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      - target: 3699
        published: 3640
        protocol: tcp
        mode: host
      - target: 13000
        published: 13001
        protocol: tcp
        mode: host
      - target: 13002
        published: 13003
        protocol: tcp
        mode: host
    volumes:
      - logs-graphd2:/logs
    networks:
      - nebula-net

  graphd3:
    image: 192.168.1.161:80/common/nebula-graphd:v1.0.0
    env_file:
      - ./nebula.env
    command:
      - --meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500
      - --port=3641
      - --ws_ip=192.168.1.168
      - --log_dir=/logs
      - --v=0
      - --minloglevel=2
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == KF2-DATA-168
    depends_on:
      - metad0
      - metad1
      - metad2
    healthcheck:
      test: ["CMD", "curl", "-f", "http://192.168.1.168:13002/status"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 20s
    ports:
      - target: 3699
        published: 3641
        protocol: tcp
        mode: host
      - target: 13000
        published: 13002
        protocol: tcp
        mode: host
      - target: 13002
        published: 13004
        protocol: tcp
        mode: host
    volumes:
      - logs-graphd3:/logs
    networks:
      - nebula-net

We suggest checking the deployment environment, for example whether any of the ports are already in use.
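
For example, a quick check on each node (a sketch; the port list is taken from the graphd2/graphd3 definitions above, and the stack name nebula is assumed):

    # is anything already listening on the host-mode ports the new
    # graphd services try to publish?
    sudo ss -ltnp | grep -E ':(3640|3641|13001|13002|13003|13004)\b'
    # show the failed tasks and their full error messages
    docker service ps nebula_graphd2 --no-trunc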

Thanks, verified and resolved. I hope the verified changes can be merged into the GitHub repository (production environment: three graphd services).


Sorry, I was on leave yesterday and could not reply in time.

The previous stack file was not merged because some of its configuration is environment-specific; we will look into making it configurable later. Thanks for the suggestion.
