add multi-cluster support.

This commit is contained in:
kaichao 2023-04-02 14:24:27 +08:00
parent 82d9ed69af
commit 0870201274
10 changed files with 37 additions and 35 deletions

View File

@ -1,13 +1,12 @@
CLUSTER ?= local
clusters ?= local bio-down
include $(CLUSTER)/defs.mk
include defs.mk
# Get Local IPv4 addr
# ip route get 1.2.3.4 | awk '{print $7}'
# hostname -I | awk '{print $1}'
# hostname -i
# local_ip_index=4
# local_addr=$$(hostname -I | awk '{print $$1}')
# local_addr=$$(hostname -I| cut -d ' ' -f ${local_ip_index})
ifeq ($(origin LOCAL_ADDR), undefined)
ifeq ($(origin LOCAL_IP_INDEX), undefined)
@ -23,25 +22,23 @@ all: reset import
reset: down
@# docker ps -a|grep -v CONTAINER|awk '{print $$1}'| while read line; do docker rm -f $$line;done
@rm -rf ${HOME}/.scalebox/log/cli
@mkdir -p ${HOME}/.scalebox/log/cli
@docker volume rm -f cluster_pgdata
@CLUSTER=$(CLUSTER) docker-compose up -d
@sleep 20
@ rm -rf ${HOME}/.scalebox/log/cli
@ mkdir -p ${HOME}/.scalebox/log/cli
@ docker volume rm -f cluster_pgdata
@ docker-compose up -d
@ sleep 25
import:
@echo "LOCAL_ADDR is "$(LOCAL_ADDR)
@cd $(CLUSTER) && LOCAL_ADDR=$(LOCAL_ADDR) scalebox app create mycluster.yaml
@ echo "LOCAL_ADDR is "$(LOCAL_ADDR)
@ for c in $(clusters); do \
LOCAL_ADDR=$(LOCAL_ADDR) scalebox app create $$c/mycluster.yaml; \
done
up:
@CLUSTER=$(CLUSTER) docker-compose up -d
@ docker-compose up -d
down:
@echo cluster:$(CLUSTER)*
@echo shared_dir:$(SHARED_DIR)*
@echo nodes:$(NODES)*
@CLUSTER=$(CLUSTER) docker-compose down
@ docker-compose down
clean:
@ pdsh -w $(NODES) 'docker rm -f $$(docker ps -qa)' | dshbak -c
@ -55,5 +52,5 @@ list:
actuator-passwordless:
@ mkdir -p $(SHARED_DIR) && cp id_rsa.pub $(SHARED_DIR)
@ pdsh -w $(NODES) 'cat $(SHARED_DIR)/id_rsa.pub >> /root/.ssh/authorized_keys' | dshbak -c
@ pdsh -w $(NODES) 'cat $(SHARED_DIR)/id_rsa.pub >> ${HOME}/.ssh/authorized_keys' | dshbak -c
@ rm -f $(SHARED_DIR)/id_rsa.pub

View File

@ -1,5 +1,9 @@
# Scalebox集群
Scalebox集群是运行scalebox应用的硬件资源集合。分为：
- 内联集群（Inline Cluster）：由Scalebox直接管理的计算资源组成的集群。支持静态资源分配、动态资源分配两种方式。
- 外部集群（External Cluster）：由外部调度系统管理的计算资源组成的集群，可支持slurm、k8s等。
## Scalebox集群介绍
### 头节点
头节点上安装了单个scalebox集群的管理服务，主要包括：
@ -7,6 +11,8 @@
- database：基于postgresql的数据库，存放app、job、task、slot等相关数据，面向controld等提供数据存储、检索等服务。
- actuator：启动端，负责在计算节点上启动slot。
头节点以及头节点服务（controld/actuator/database）可为多个集群所共享。
### 计算节点
计算节点分为两类:
- 内部计算节点：scalebox内部调度管理的计算节点，通过免密ssh启动slot

View File

@ -1,5 +0,0 @@
CLUSTER := bio-down
SHARED_DIR := /gfs
NODES := n[0-3],h0

View File

@ -1,9 +1,8 @@
CLUSTER := local
SHARED_DIR := /gfs
NODES := h0
NODES := n[0-3],h0
# Customize local ip by defining LOCAL_IP_INDEX or LOCAL_ADDR
# hostname -I | awk '{print $1}'
# LOCAL_ADDR=127.0.0.1
# LOCAL_IP_INDEX=2

View File

@ -15,7 +15,6 @@ services:
ports:
- 50051:50051
environment:
- CLUSTER=${CLUSTER}
- PGHOST=${PGHOST}
- PGPORT=${PGPORT}
- LOG_LEVEL=WARN
@ -33,7 +32,6 @@ services:
max-size: "50m"
max-file: "5"
environment:
- CLUSTER=${CLUSTER}
- GRPC_SERVER=controld:50051
- PGHOST=${PGHOST}
- PGPORT=${PGPORT}
@ -58,7 +56,6 @@ services:
- POSTGRES_USER=${POSTGRES_USER}
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
volumes:
# - ${PWD}/PGDATA:/var/lib/postgresql/data
- pgdata:/var/lib/postgresql/data
- /etc/localtime:/etc/localtime
ports:

View File

@ -11,6 +11,8 @@ jobs:
schedule_mode: HEAD
parameters:
start_message: ANY
variables:
repeated: yes
environments:
- NUM_GROUPS=${NUM_GROUPS}
- GROUP_SIZE=${GROUP_SIZE}

View File

@ -0,0 +1,8 @@
CLUSTER=bio-down
GROUP_SIZE=10000
NUM_GROUPS=100
CALC_HOST=(n[0-3])|(h0)
NUM_PARALLEL=4

View File

@ -19,5 +19,5 @@ echo -n $count $sum > result.txt
# save the result to t_app, and set the status of the application to FINISHED
if [[ "$count" = "${NUM_GROUPS}" ]]; then
scalebox app set-finished -job-id=${JOB_ID} "Result is "${sum}
scalebox app set-finished --job-id=${JOB_ID} "Result is "${sum}
fi

View File

@ -1,10 +1,8 @@
# CLUSTER=local
CLUSTER=bio-down
CLUSTER=local
GROUP_SIZE=10000
NUM_GROUPS=100
# CALC_HOST=h0
CALC_HOST=(n[0-3])|(h0)
CALC_HOST=h0
NUM_PARALLEL=4
NUM_PARALLEL=1

View File

@ -3,6 +3,6 @@
echo "Input message:"$1
echo "Hello, $1!"
scalebox app set-finished -job-id=${JOB_ID} "Hello, Scalebox is OK!"
scalebox app set-finished --job-id=${JOB_ID} "Hello, Scalebox is OK!"
exit 0