add multi-cluster support.
This commit is contained in:
parent 82d9ed69af
commit 0870201274
@@ -1,13 +1,12 @@
-CLUSTER ?= local
+clusters ?= local bio-down

-include $(CLUSTER)/defs.mk
+include defs.mk

 # Get Local IPv4 addr
 # ip route get 1.2.3.4 | awk '{print $7}'
 # hostname -I | awk '{print $1}'
 # hostname -i
 # local_ip_index=4
 # local_addr=$$(hostname -I | awk '{print $$1}')
 # local_addr=$$(hostname -I| cut -d ' ' -f ${local_ip_index})
 ifeq ($(origin LOCAL_ADDR), undefined)
 ifeq ($(origin LOCAL_IP_INDEX), undefined)
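The ifeq blocks above only auto-detect the address when neither variable is already set; a minimal usage sketch (values taken from the commented examples in defs.mk further down, assuming the variable names stay as shown):

    # override the auto-detected address for targets that use LOCAL_ADDR
    LOCAL_ADDR=127.0.0.1 make import
    # or pick the Nth field of `hostname -I` instead
    LOCAL_IP_INDEX=2 make import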
@@ -23,25 +22,23 @@ all: reset import

 reset: down
 	@# docker ps -a|grep -v CONTAINER|awk '{print $$1}'| while read line; do docker rm -f $$line;done
-	@rm -rf ${HOME}/.scalebox/log/cli
-	@mkdir -p ${HOME}/.scalebox/log/cli
-	@docker volume rm -f cluster_pgdata
-	@CLUSTER=$(CLUSTER) docker-compose up -d
-	@sleep 20
+	@ rm -rf ${HOME}/.scalebox/log/cli
+	@ mkdir -p ${HOME}/.scalebox/log/cli
+	@ docker volume rm -f cluster_pgdata
+	@ docker-compose up -d
+	@ sleep 25

 import:
-	@echo "LOCAL_ADDR is "$(LOCAL_ADDR)
-	@cd $(CLUSTER) && LOCAL_ADDR=$(LOCAL_ADDR) scalebox app create mycluster.yaml
+	@ echo "LOCAL_ADDR is "$(LOCAL_ADDR)
+	@ for c in $(clusters); do \
+		LOCAL_ADDR=$(LOCAL_ADDR) scalebox app create $$c/mycluster.yaml; \
+	done

 up:
-	@CLUSTER=$(CLUSTER) docker-compose up -d
+	@ docker-compose up -d

 down:
-	@echo cluster:$(CLUSTER)*
-	@echo shared_dir:$(SHARED_DIR)*
-	@echo nodes:$(NODES)*
-
-	@CLUSTER=$(CLUSTER) docker-compose down
+	@ docker-compose down

 clean:
 	@ pdsh -w $(NODES) 'docker rm -f $$(docker ps -qa)' | dshbak -c
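The import target now loops over the clusters list instead of cd'ing into a single $(CLUSTER) directory; roughly, with the default clusters ?= local bio-down it expands to the following shell (a sketch, not the Makefile itself):

    # what `make import` effectively runs with clusters ?= local bio-down
    echo "LOCAL_ADDR is "$LOCAL_ADDR
    for c in local bio-down; do
        LOCAL_ADDR=$LOCAL_ADDR scalebox app create $c/mycluster.yaml
    done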
@@ -55,5 +52,5 @@ list:

 actuator-passwordless:
 	@ mkdir -p $(SHARED_DIR) && cp id_rsa.pub $(SHARED_DIR)
-	@ pdsh -w $(NODES) 'cat $(SHARED_DIR)/id_rsa.pub >> /root/.ssh/authorized_keys' | dshbak -c
+	@ pdsh -w $(NODES) 'cat $(SHARED_DIR)/id_rsa.pub >> ${HOME}/.ssh/authorized_keys' | dshbak -c
 	@ rm -f $(SHARED_DIR)/id_rsa.pub
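With the key now appended under ${HOME}/.ssh instead of /root/.ssh, a quick spot check that it landed on every node could look like this (a sketch only; node list and tools taken from defs.mk and the target above):

    # show the last authorized key on each node and collapse identical output
    pdsh -w 'n[0-3],h0' 'tail -n 1 $HOME/.ssh/authorized_keys' | dshbak -c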
@@ -1,5 +1,9 @@
 # Scalebox Cluster

+A Scalebox cluster is the set of hardware resources on which a scalebox application runs. There are two kinds:
+- Inline cluster: a cluster of compute resources managed directly by Scalebox; both static and dynamic resource allocation are supported.
+- External cluster: a cluster of compute resources managed by an external scheduling system, such as slurm or k8s.
+
 ## Scalebox Cluster Overview
 ### Head Node
 The head node runs the management services of a single scalebox cluster, which mainly include:
@@ -7,6 +11,8 @@
 - database: a PostgreSQL-based database that stores app, job, task, slot and other related data, and provides data storage and query services to controld and the other components.
 - actuator: the launcher, responsible for starting slots on the compute nodes.

+The head node and its services (controld/actuator/database) can be shared by multiple clusters.
+
 ### Compute Nodes
 There are two kinds of compute nodes:
 - Internal compute nodes: nodes scheduled and managed by scalebox itself, on which slots are started via passwordless ssh;
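A minimal sketch of the shared head-node flow described above, using only the Makefile targets and compose services from this commit:

    # start the shared head-node services (controld, database, actuator)
    docker-compose up -d
    # register one application per cluster (local/mycluster.yaml, bio-down/mycluster.yaml)
    make import
    # tear the shared services down again
    docker-compose down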
@@ -1,5 +0,0 @@
-CLUSTER := bio-down
-
-SHARED_DIR := /gfs
-
-NODES := n[0-3],h0
@@ -1,9 +1,8 @@
-CLUSTER := local
-
 SHARED_DIR := /gfs

-NODES := h0
+NODES := n[0-3],h0

+# Customize local ip by defining LOCAL_IP_INDEX or LOCAL_ADDR
 # hostname -I | awk '{print $1}'
 # LOCAL_ADDR=127.0.0.1
 # LOCAL_IP_INDEX=2
@@ -15,7 +15,6 @@ services:
     ports:
       - 50051:50051
     environment:
-      - CLUSTER=${CLUSTER}
       - PGHOST=${PGHOST}
       - PGPORT=${PGPORT}
       - LOG_LEVEL=WARN
@@ -33,7 +32,6 @@ services:
         max-size: "50m"
         max-file: "5"
     environment:
-      - CLUSTER=${CLUSTER}
       - GRPC_SERVER=controld:50051
       - PGHOST=${PGHOST}
       - PGPORT=${PGPORT}
@@ -58,7 +56,6 @@ services:
       - POSTGRES_USER=${POSTGRES_USER}
       - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
     volumes:
-      # - ${PWD}/PGDATA:/var/lib/postgresql/data
       - pgdata:/var/lib/postgresql/data
       - /etc/localtime:/etc/localtime
     ports:
@@ -11,6 +11,8 @@ jobs:
     schedule_mode: HEAD
     parameters:
       start_message: ANY
+    variables:
+      repeated: yes
     environments:
       - NUM_GROUPS=${NUM_GROUPS}
       - GROUP_SIZE=${GROUP_SIZE}
@@ -0,0 +1,8 @@
+CLUSTER=bio-down
+
+GROUP_SIZE=10000
+NUM_GROUPS=100
+
+CALC_HOST=(n[0-3])|(h0)
+
+NUM_PARALLEL=4
@@ -19,5 +19,5 @@ echo -n $count $sum > result.txt

 # save the result to t_app, and set the status of the application to FINISHED
 if [[ "$count" = "${NUM_GROUPS}" ]]; then
-	scalebox app set-finished -job-id=${JOB_ID} "Result is "${sum}
+	scalebox app set-finished --job-id=${JOB_ID} "Result is "${sum}
 fi
@@ -1,10 +1,8 @@
-# CLUSTER=local
-CLUSTER=bio-down
+CLUSTER=local

 GROUP_SIZE=10000
 NUM_GROUPS=100

-# CALC_HOST=h0
-CALC_HOST=(n[0-3])|(h0)
+CALC_HOST=h0

-NUM_PARALLEL=4
+NUM_PARALLEL=1
@@ -3,6 +3,6 @@
 echo "Input message:"$1
 echo "Hello, $1!"

-scalebox app set-finished -job-id=${JOB_ID} "Hello, Scalebox is OK!"
+scalebox app set-finished --job-id=${JOB_ID} "Hello, Scalebox is OK!"

 exit 0