Compare commits

...

7 Commits

Author SHA1 Message Date
tzwang 9c6b3801fd updated shuguang imageinfer func 2024-09-20 17:49:28 +08:00
zhangweiii 349b9660b1 GeoMap
Former-commit-id: 5383e3c264
2024-09-20 09:12:52 +08:00
zhangwei 3e11a1789b 集群添加坐标信息
Former-commit-id: 822ae45b3a
2024-09-19 19:56:13 +08:00
zhangwei 8e33750179 Merge branch 'master' of https://gitlink.org.cn/JointCloud/pcm-coordinator
Former-commit-id: cead65aae1
2024-09-19 09:07:31 +08:00
qiwang 0003d147a0 fix: update modelarts ServiceName now.Unix of ai 2024-09-19 08:45:37 +08:00
tzwang 32bc8b11fe Merge pull request 'updated cron function' (#313) from tzwang/pcm-coordinator:master into master 2024-09-18 17:37:48 +08:00
zhangwei 5cdb491d05 Merge branch 'master' of https://gitlink.org.cn/JointCloud/pcm-coordinator
# Conflicts:
#	Dockerfile
#	etc/pcm.yaml
#	go.mod
#	go.sum
#	internal/logic/inference/deployinstancelistlogic.go
#	internal/logic/inference/getdeploytasksbytypelogic.go
#	internal/logic/inference/startallbydeploytaskidlogic.go
#	internal/logic/inference/stopallbydeploytaskidlogic.go
#	internal/storeLink/modelarts.go
#	internal/storeLink/storeLink.go


Former-commit-id: 6bcef559a9
2024-09-12 17:03:39 +08:00
10 changed files with 237 additions and 75 deletions

View File

@ -858,7 +858,7 @@ type (
Token string `json:"token,optional"`
Ak string `json:"ak,optional"`
Sk string `json:"sk,optional"`
Region string `json:"region,optional"`
Region []string `json:"region,optional"`
ProjectId string `json:"projectId,optional"`
Version string `json:"version,optional"`
Label string `json:"label,optional"`
@ -866,6 +866,7 @@ type (
AuthType string `json:"authType,optional"`
ProducerDict string `json:"producerDict,optional"`
RegionDict string `json:"regionDict,optional"`
RegionName string `json:"regionName,optional"`
}
ClusterInfo {
Id string `json:"id,omitempty" db:"id"`
@ -888,6 +889,7 @@ type (
AuthType string `json:"authType,omitempty" db:"auth_type"`
ProducerDict string `json:"producerDict,omitempty" db:"producer_dict"`
RegionDict string `json:"regionDict,omitempty" db:"region_dict"`
Location string `json:"location,omitempty" db:"location"`
CreateTime string `json:"createTime,omitempty" db:"created_time" gorm:"autoCreateTime"`
}
)

4
go.mod
View File

@ -18,8 +18,8 @@ require (
github.com/prometheus/common v0.59.1
github.com/robfig/cron/v3 v3.0.1
github.com/zeromicro/go-zero v1.7.2
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240918015229-59c579d1a437
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240909072501-939c3144cd9e
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240920093406-601f283f0185
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240918011543-482dcd609877
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5

8
go.sum
View File

@ -466,10 +466,10 @@ github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
github.com/zeromicro/go-zero v1.7.2 h1:a8lyVOG3KXG4LrAy6ZmtJTJtisX4Ostc4Pst4fE704I=
github.com/zeromicro/go-zero v1.7.2/go.mod h1:WFXfF92Exw0O7WECifS6r99JSzv4KEN49x9RhAfgkMc=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240918015229-59c579d1a437 h1:ta6h9+FU7AQ2fNyQiXrZnMdlNBjOKdyBx4e3RF7BE84=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240918015229-59c579d1a437/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY=
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240909072501-939c3144cd9e h1:6LYJggBoeAQxy/otzWjt40Pa7gnVvUR4c5YMi6A/NdU=
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240909072501-939c3144cd9e/go.mod h1:/eOmBFZKWGoabG3sRVkVvIbLwsd2631k4jkUBR6x1AA=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240920093406-601f283f0185 h1:B+YBB5xHlIAS6ILuaCGQwbOpr/L6LOHAlj9PeFUCetM=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240920093406-601f283f0185/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY=
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240918011543-482dcd609877 h1:a+1FpxqLPRojlAkJlAeRhKRbxajymXYgrM+s9bfQx0E=
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240918011543-482dcd609877/go.mod h1:/eOmBFZKWGoabG3sRVkVvIbLwsd2631k4jkUBR6x1AA=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110 h1:GaXwr5sgDh0raHjUf9IewTvnRvajYea7zbLsaerYyXo=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI=

View File

@ -3,12 +3,16 @@ package adapters
import (
"context"
"errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gorm.io/gorm"
"time"
"fmt"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gorm.io/gorm"
"io/ioutil"
"k8s.io/apimachinery/pkg/util/json"
"net/http"
"net/url"
"time"
"github.com/zeromicro/go-zero/core/logx"
)
@ -33,17 +37,27 @@ func (l *CreateClusterLogic) CreateCluster(req *types.ClusterCreateReq) (resp *t
if errors.Is(result.Error, gorm.ErrRecordNotFound) {
return nil, errors.New("adapter does not exist")
}
cluster := types.ClusterInfo{}
utils.Convert(req, &cluster)
cluster.Id = utils.GenSnowflakeIDStr()
tool.Convert(req, &cluster)
cluster.CreateTime = time.Now().Format("2006-01-02 15:04:05")
cluster.OwnerId = "0"
// 获取集群经纬度
location, err := GeoMap(req.RegionName)
cluster.Location = location
cluster.Id = tool.GenSnowflakeIDStr()
tx := l.svcCtx.DbEngin.Table("t_cluster").Create(&cluster)
if tx.Error != nil {
logx.Errorf(tx.Error.Error())
return nil, errors.New("cluster create failed")
}
if err != nil {
return nil, err
}
// push cluster info to adapter
var adapterServer string
l.svcCtx.DbEngin.Raw("select server from t_adapter where id = ?", req.AdapterId).Scan(&adapterServer)
@ -64,3 +78,63 @@ func (l *CreateClusterLogic) CreateCluster(req *types.ClusterCreateReq) (resp *t
}
return
}
// geoHTTPClient is the HTTP client used for geocoding lookups. It carries a
// timeout so that a stalled geocoding service cannot block cluster creation
// indefinitely.
var geoHTTPClient = &http.Client{Timeout: 10 * time.Second}

// GeoMap resolves a free-form address to a coordinate string by querying the
// AMap geocoding REST API and returning the "location" field of the first
// geocode in the response.
//
// It returns a non-nil error (and an empty string) when the address is empty,
// the request fails, the service answers with a non-200 status, the body cannot
// be read or decoded, or the response contains no usable geocode. It never
// panics on a failed or empty lookup.
func GeoMap(address string) (string, error) {
	if address == "" {
		return "", errors.New("geocode: address is empty")
	}

	// AK obtained from the AMap console (application management) after
	// creating an application.
	// NOTE(review): this credential is hard-coded in source; it should be moved
	// to configuration and rotated.
	ak := "d3cc9eee0266d39a52498726d1b82f87"
	// Endpoint of the geocoding API.
	uri := "https://restapi.amap.com/v3/geocode/geo"
	// Query parameters of the request.
	params := url.Values{
		"address": []string{address},
		"output":  []string{"json"},
		"key":     []string{ak},
	}

	resp, err := geoHTTPClient.Get(uri + "?" + params.Encode())
	if err != nil {
		// The client wraps failures in *url.Error, whose message embeds the full
		// request URL (including the key). Unwrap it so the key does not leak
		// into errors that callers may return to API clients.
		var urlErr *url.Error
		if errors.As(err, &urlErr) {
			err = urlErr.Err
		}
		return "", fmt.Errorf("geocode %q: request: %w", address, err)
	}
	// Only defer the close once resp is known to be non-nil.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("geocode %q: unexpected HTTP status %d", address, resp.StatusCode)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("geocode %q: reading response: %w", address, err)
	}

	var geoResponse GeoResponse
	// A type mismatch on one field does not stop the decoder from filling the
	// remaining fields, so a decode error is only fatal when the location could
	// not be extracted. NOTE(review): the service appears to send [] for some
	// empty string fields - confirm against the AMap documentation.
	decodeErr := json.Unmarshal(body, &geoResponse)
	if len(geoResponse.Geocodes) > 0 && geoResponse.Geocodes[0].Location != "" {
		return geoResponse.Geocodes[0].Location, nil
	}
	if decodeErr != nil {
		return "", fmt.Errorf("geocode %q: decoding response: %w", address, decodeErr)
	}
	return "", fmt.Errorf("geocode %q: no location found (status=%q, info=%q)",
		address, geoResponse.Status, geoResponse.Info)
}

// GeoResponse is the top-level JSON document returned by the geocoding API.
// All scalar fields are decoded as strings, matching the JSON tags below.
type GeoResponse struct {
	Status   string    `json:"status"`
	Info     string    `json:"info"`
	InfoCode string    `json:"infocode"`
	Count    string    `json:"count"`
	Geocodes []GeoCode `json:"geocodes"`
}

// GeoCode is a single geocoding match inside GeoResponse.Geocodes. GeoMap only
// consumes Location; the other fields mirror the response payload.
type GeoCode struct {
	FormattedAddress string `json:"formatted_address"`
	Country          string `json:"country"`
	Province         string `json:"province"`
	CityCode         string `json:"citycode"`
	City             string `json:"city"`
	District         string `json:"district"`
	Adcode           string `json:"adcode"`
	Number           string `json:"number"`
	Location         string `json:"location"`
	Level            string `json:"level"`
}

View File

@ -463,7 +463,7 @@ func getInferResult(url string, file multipart.File, fileName string, clusterId
switch clusterType {
case storeLink.TYPE_OCTOPUS:
r := http.Request{}
result, err := iCluster.GetInferResult(r.Context(), url, file, fileName)
result, err := iCluster.GetImageInferResult(r.Context(), url, file, fileName)
if err != nil {
return "", err
}

View File

@ -12,13 +12,17 @@ const (
type ICluster interface {
GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*ClusterInferUrl, error)
GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error)
GetInferDeployInstanceList(ctx context.Context) ([]*DeployInstance, error)
StartInferDeployInstance(ctx context.Context, id string) bool
StopInferDeployInstance(ctx context.Context, id string) bool
GetInferDeployInstance(ctx context.Context, id string) (*DeployInstance, error)
CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error)
CheckModelExistence(ctx context.Context, modelName string, modelType string) bool
InferResult
}
// InferResult groups the image-inference call; it is embedded in ICluster so
// every cluster implementation must provide it.
type InferResult interface {
// GetImageInferResult takes a request context, a url, an uploaded file and
// that file's name, and returns a string result or an error.
// NOTE(review): presumably the file is submitted to the inference service at
// url and the string is the service's answer - confirm against implementations.
GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error)
}
type IInference interface {

View File

@ -767,7 +767,7 @@ func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (
return ins, nil
}
func (m *ModelArtsLink) GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
func (m *ModelArtsLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
return "", nil
}
@ -791,11 +791,14 @@ func (m *ModelArtsLink) CreateInferDeployInstance(ctx context.Context, option *o
}
var configItems []*modelarts.ServiceConfig
configItems = append(configItems, configParam)
now := time.Now()
timestampSec := now.Unix()
str := strconv.FormatInt(timestampSec, 10)
req := &modelarts.CreateServiceReq{
Platform: m.platform,
Config: configItems,
InferType: "real-time",
ServiceName: option.ModelName + "_" + option.ModelType + "_" + Npu,
ServiceName: option.ModelName + "_" + option.ModelType + "_" + Npu + "_" + str,
}
ctx, cancel := context.WithTimeout(context.Background(), 150*time.Second)
defer cancel()

View File

@ -1183,7 +1183,7 @@ func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*i
return ins, nil
}
func (o *OctopusLink) GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
func (o *OctopusLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
stream, err := o.octopusRpc.GetInferResult(ctx)
if err != nil {
return "", err

View File

@ -47,49 +47,60 @@ const (
PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py"
DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm"
KUNSHAN_DIR = "/public/home/acgnnmfbwo/pcmv1"
TRAIN_FILE = "train.py"
CPUCOREPRICEPERHOUR = 0.09
DCUPRICEPERHOUR = 2.0
KB = 1024
TIMEOUT = 20
DEPLOY_INSTANCE_LIMIT = 100
ProtocolType = "HTTP"
ContainerPort = 8881
JUPYTER = "jupyter"
)
var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": {
CPU: 1,
GPU: 1,
RAM: 2 * RAM_SIZE_1G,
},
"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": {
CPU: 1,
GPU: 2,
RAM: 2 * RAM_SIZE_1G,
},
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": {
CPU: 2,
GPU: 3,
RAM: 4 * RAM_SIZE_1G,
},
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": {
CPU: 4,
GPU: 4,
RAM: 8 * RAM_SIZE_1G,
},
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": {
CPU: 5,
GPU: 5,
RAM: 10 * RAM_SIZE_1G,
},
}
var (
RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": {
CPU: 1,
GPU: 1,
RAM: 2 * RAM_SIZE_1G,
},
"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": {
CPU: 1,
GPU: 2,
RAM: 2 * RAM_SIZE_1G,
},
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": {
CPU: 2,
GPU: 3,
RAM: 4 * RAM_SIZE_1G,
},
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": {
CPU: 4,
GPU: 4,
RAM: 8 * RAM_SIZE_1G,
},
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": {
CPU: 5,
GPU: 5,
RAM: 10 * RAM_SIZE_1G,
},
}
var RESOURCESPECSAI = map[string]string{
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G",
"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G",
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:3, RAM:4G",
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:4, DCU:4, RAM:8G",
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:5, RAM:10G",
}
RESOURCESPECSAI = map[string]string{
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G",
"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G",
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:3, RAM:4G",
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:4, DCU:4, RAM:8G",
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:5, RAM:10G",
}
ModelNameCmdMap = map[string]string{
"blip-image-captioning-base": "pip install transformers python-multipart fastapi uvicorn[standard]; python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/blip_image_captioning_base/infer.py",
"imagenet_resnet50": "pip install fastapi uvicorn[standard] python-multipart; python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/imagenet_resnet50/infer.py",
}
)
type ResourceSpecSGAI struct {
CPU int64
@ -905,15 +916,81 @@ func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*in
return ins, nil
}
func (s *ShuguangAi) GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
func (s *ShuguangAi) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
return "", nil
}
func (s *ShuguangAi) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
containerPortInfoList := []*hpcAC.ContainerPortInfoList{
{
ProtocolType: ProtocolType,
ContainerPort: ContainerPort,
},
}
return "", nil
desc := option.ModelType + FORWARD_SLASH + option.ModelName + FORWARD_SLASH + strings.ToLower(DCU)
instanceServiceName := "infer_instance" + UNDERSCORE + utils.RandomString(15)
resourceGroup := "kshdtest"
script, ok := ModelNameCmdMap[option.ModelName]
if !ok {
return "", errors.New("failed to set cmd, ModelName not exist")
}
param := &hpcAC.CreateParams{
AcceleratorType: strings.ToLower(DCU),
ContainerPortInfoList: containerPortInfoList,
CpuNumber: 8,
Description: desc,
//env
GpuNumber: 1,
ImagePath: "11.11.100.6:5000/dcu/admin/base/jupyterlab-pytorch:1.13.1-py3.7-dtk23.04-centos7.6",
InstanceServiceName: instanceServiceName,
MountInfoList: make([]*hpcAC.MountInfoList, 0),
//originalVersion
RamSize: 10 * RAM_SIZE_1G,
//rdma
ResourceGroup: resourceGroup,
StartScriptActionScope: "all",
StartScriptContent: script,
//startServiceCommand
//taskClassification: "interactive"
TaskNumber: 1,
TaskType: JUPYTER,
TimeoutLimit: "01:00:00",
UseStartScript: true,
//useStartServiceCommand: false
Version: "jupyterlab-pytorch:1.13.1-py3.7-dtk23.04-centos7.6",
}
req := &hpcacclient.CreateInstanceServiceReq{
Data: param,
}
resp, err := s.aCRpc.CreateInstanceService(ctx, req)
if err != nil {
return "", err
}
if resp.Code != "0" {
return "", errors.New(resp.Msg)
}
return resp.Data, nil
}
func (s *ShuguangAi) CheckModelExistence(ctx context.Context, name string, mtype string) bool {
return false
modelPath := "model" + FORWARD_SLASH + name
req := &hpcAC.IsExistFileReq{
Path: KUNSHAN_DIR + FORWARD_SLASH + modelPath,
}
resp, err := s.aCRpc.IsExistFile(ctx, req)
if err != nil {
return false
}
if resp.Code != "0" || resp.Data == nil {
return false
}
return resp.Data.Exist
}

View File

@ -779,26 +779,27 @@ type ClusterReq struct {
}
type ClusterCreateReq struct {
Id string `json:"id,optional"`
AdapterId string `json:"adapterId,optional"`
Name string `json:"name,optional"`
Nickname string `json:"nickname,optional"`
Description string `json:"description,optional"`
Server string `json:"server,optional"`
MonitorServer string `json:"monitorServer,optional"`
Username string `json:"username,optional"`
Password string `json:"password,optional"`
Token string `json:"token,optional"`
Ak string `json:"ak,optional"`
Sk string `json:"sk,optional"`
Region string `json:"region,optional"`
ProjectId string `json:"projectId,optional"`
Version string `json:"version,optional"`
Label string `json:"label,optional"`
OwnerId string `json:"ownerId,omitempty,optional"`
AuthType string `json:"authType,optional"`
ProducerDict string `json:"producerDict,optional"`
RegionDict string `json:"regionDict,optional"`
Id string `json:"id,optional"`
AdapterId string `json:"adapterId,optional"`
Name string `json:"name,optional"`
Nickname string `json:"nickname,optional"`
Description string `json:"description,optional"`
Server string `json:"server,optional"`
MonitorServer string `json:"monitorServer,optional"`
Username string `json:"username,optional"`
Password string `json:"password,optional"`
Token string `json:"token,optional"`
Ak string `json:"ak,optional"`
Sk string `json:"sk,optional"`
Region []string `json:"region,optional"`
ProjectId string `json:"projectId,optional"`
Version string `json:"version,optional"`
Label string `json:"label,optional"`
OwnerId string `json:"ownerId,omitempty,optional"`
AuthType string `json:"authType,optional"`
ProducerDict string `json:"producerDict,optional"`
RegionDict string `json:"regionDict,optional"`
RegionName string `json:"regionName,optional"`
}
type ClusterInfo struct {
@ -822,6 +823,7 @@ type ClusterInfo struct {
AuthType string `json:"authType,omitempty" db:"auth_type"`
ProducerDict string `json:"producerDict,omitempty" db:"producer_dict"`
RegionDict string `json:"regionDict,omitempty" db:"region_dict"`
Location string `json:"location,omitempty" db:"location"`
CreateTime string `json:"createTime,omitempty" db:"created_time" gorm:"autoCreateTime"`
}