Merge remote-tracking branch 'origin/master'

# Conflicts:
#	go.mod
This commit is contained in:
qiwang 2024-07-25 17:19:40 +08:00
commit 8bbb72142d
25 changed files with 548 additions and 74 deletions

View File

@ -1738,6 +1738,11 @@ PayloadCreateTrainJob{
Card string `json:"card,optional"`
TimeElapsed int32 `json:"elapsed,optional"`
}
TrainingTaskStatResp {
Running int32 `json:"running"`
Total int32 `json:"total"`
}
)
/******************create TrainIngJob end*************************/

View File

@ -88,7 +88,8 @@ type (
StartDeployInstanceReq {
AdapterId string `form:"adapterId"`
ClusterId string `form:"clusterId"`
InstanceId string `form:"id"`
Id string `form:"id"`
InstanceId string `form:"instanceId"`
}
StartDeployInstanceResp {
@ -97,10 +98,26 @@ type (
StopDeployInstanceReq {
AdapterId string `form:"adapterId"`
ClusterId string `form:"clusterId"`
InstanceId string `form:"id"`
Id string `form:"id"`
InstanceId string `form:"instanceId"`
}
StopDeployInstanceResp {
}
DeployInstanceStatReq {
}
DeployInstanceStatResp {
Running int32 `json:"running"`
Total int32 `json:"total"`
}
InferenceTaskStatReq {}
InferenceTaskStatResp {
Running int32 `json:"running"`
Total int32 `json:"total"`
}
)

View File

@ -244,6 +244,10 @@ service pcm {
group: ai
)
service pcm {
@doc "训练任务统计"
@handler trainingTaskStatHandler
get /ai/trainingTaskStat returns (TrainingTaskStatResp)
@doc "智算中心概览"
@handler getCenterOverviewHandler
get /ai/getCenterOverview returns (CenterOverviewResp)
@ -949,6 +953,12 @@ service pcm {
@handler StopDeployInstanceHandler
post /inference/stopDeployInstance (StopDeployInstanceReq) returns (StopDeployInstanceResp)
@handler DeployInstanceStatHandler
get /inference/deployInstanceStat (DeployInstanceStatReq) returns (DeployInstanceStatResp)
@handler InferenceTaskStatHandler
get /inference/taskStat (InferenceTaskStatReq) returns (InferenceTaskStatResp)
}
@server(

4
go.mod
View File

@ -18,8 +18,8 @@ require (
github.com/prometheus/common v0.55.0
github.com/robfig/cron/v3 v3.0.1
github.com/zeromicro/go-zero v1.6.6
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240724095608-1727d09f030c
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240725071305-f751eec4dde1
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240722092017-50d17f36d023
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5

4
go.sum
View File

@ -495,8 +495,8 @@ github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
github.com/zeromicro/go-zero v1.6.6 h1:nZTVYObklHiBdYJ/nPoAZ8kGVAplWSDjT7DGE7ur0uk=
github.com/zeromicro/go-zero v1.6.6/go.mod h1:olKf1/hELbSmuIgLgJeoeNVp3tCbLqj6UmO7ATSta4A=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249 h1:bHJGq5P+8w4fP62PZhIiq/fvOhvDPRtkM4pcmU8OZ1w=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240725071305-f751eec4dde1 h1:DicBXoQiC6mumMBeyqSPNrsjtqJIgk5Pv2hscu2xryw=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240725071305-f751eec4dde1/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY=
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240722092017-50d17f36d023 h1:9DNobl0gvRCSXtjyMsfUwq0w0TMvds4rqNRsEqeX4j8=
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240722092017-50d17f36d023/go.mod h1:/eOmBFZKWGoabG3sRVkVvIbLwsd2631k4jkUBR6x1AA=
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240724095608-1727d09f030c h1:CodJeGgTYJwA6NDHFnw6B+4pBXUl79tvAcECq39tgZI=

View File

@ -0,0 +1,17 @@
package ai
import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/ai"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
)
// TrainingTaskStatHandler builds the HTTP handler for the training-task
// statistics endpoint. The endpoint takes no request parameters: the handler
// runs TrainingTaskStatLogic with the request context and hands the outcome
// (payload or error) to result.HttpResult.
func TrainingTaskStatHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		stats, err := ai.NewTrainingTaskStatLogic(r.Context(), svcCtx).TrainingTaskStat()
		result.HttpResult(r, w, stats, err)
	}
}

View File

@ -1,28 +1,24 @@
package inference
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"jcc-coordinator/internal/logic/inference"
"jcc-coordinator/internal/svc"
"jcc-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
)
// DeployInstanceListHandler builds the HTTP handler that lists inference
// deploy instances.
//
// The request is decoded with httpx.Parse; a decoding failure is reported once
// through result.ParamErrorResult. Otherwise the logic layer is invoked and its
// outcome is written exactly once through result.HttpResult. The former
// httpx.ErrorCtx / httpx.OkJsonCtx calls are gone so that a response is never
// written twice.
func DeployInstanceListHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var req types.DeployInstanceListReq
		if err := httpx.Parse(r, &req); err != nil {
			result.ParamErrorResult(r, w, err)
			return
		}

		l := inference.NewDeployInstanceListLogic(r.Context(), svcCtx)
		resp, err := l.DeployInstanceList(&req)
		result.HttpResult(r, w, resp, err)
	}
}

View File

@ -0,0 +1,26 @@
package inference
import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
)
// DeployInstanceStatHandler builds the HTTP handler for the deploy-instance
// statistics endpoint. It decodes the (empty) request with httpx.Parse,
// reports decoding failures via result.ParamErrorResult, and otherwise
// forwards the logic layer's outcome to result.HttpResult.
func DeployInstanceStatHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var req types.DeployInstanceStatReq
		if parseErr := httpx.Parse(r, &req); parseErr != nil {
			result.ParamErrorResult(r, w, parseErr)
			return
		}

		stats, err := inference.NewDeployInstanceStatLogic(r.Context(), svcCtx).DeployInstanceStat(&req)
		result.HttpResult(r, w, stats, err)
	}
}

View File

@ -0,0 +1,25 @@
package inference
import (
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
)
// InferenceTaskStatHandler builds the HTTP handler for the inference-task
// statistics endpoint. It decodes the (empty) request with httpx.Parse,
// reports decoding failures via result.ParamErrorResult, and otherwise
// forwards the logic layer's outcome to result.HttpResult.
func InferenceTaskStatHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var req types.InferenceTaskStatReq
		if parseErr := httpx.Parse(r, &req); parseErr != nil {
			result.ParamErrorResult(r, w, parseErr)
			return
		}

		stats, err := inference.NewInferenceTaskStatLogic(r.Context(), svcCtx).InferenceTaskStat(&req)
		result.HttpResult(r, w, stats, err)
	}
}

View File

@ -1,28 +1,24 @@
package inference
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"jcc-coordinator/internal/logic/inference"
"jcc-coordinator/internal/svc"
"jcc-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
)
// StartDeployInstanceListHandler builds the HTTP handler that starts an
// inference deploy instance.
//
// The request is decoded with httpx.Parse; a decoding failure is reported once
// through result.ParamErrorResult. Otherwise the logic layer is invoked and its
// outcome is written exactly once through result.HttpResult. The former
// httpx.ErrorCtx / httpx.OkJsonCtx calls are gone so that a response is never
// written twice.
func StartDeployInstanceListHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var req types.StartDeployInstanceReq
		if err := httpx.Parse(r, &req); err != nil {
			result.ParamErrorResult(r, w, err)
			return
		}

		l := inference.NewStartDeployInstanceListLogic(r.Context(), svcCtx)
		resp, err := l.StartDeployInstanceList(&req)
		result.HttpResult(r, w, resp, err)
	}
}

View File

@ -1,28 +1,24 @@
package inference
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"jcc-coordinator/internal/logic/inference"
"jcc-coordinator/internal/svc"
"jcc-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
)
// StopDeployInstanceHandler builds the HTTP handler that stops an inference
// deploy instance.
//
// The request is decoded with httpx.Parse; a decoding failure is reported once
// through result.ParamErrorResult. Otherwise the logic layer is invoked and its
// outcome is written exactly once through result.HttpResult. The former
// httpx.ErrorCtx / httpx.OkJsonCtx calls are gone so that a response is never
// written twice.
func StopDeployInstanceHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var req types.StopDeployInstanceReq
		if err := httpx.Parse(r, &req); err != nil {
			result.ParamErrorResult(r, w, err)
			return
		}

		l := inference.NewStopDeployInstanceLogic(r.Context(), svcCtx)
		resp, err := l.StopDeployInstance(&req)
		result.HttpResult(r, w, resp, err)
	}
}

View File

@ -287,6 +287,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
server.AddRoutes(
[]rest.Route{
{
Method: http.MethodGet,
Path: "/ai/trainingTaskStat",
Handler: ai.TrainingTaskStatHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/ai/getCenterOverview",
@ -1198,6 +1203,16 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/inference/stopDeployInstance",
Handler: inference.StopDeployInstanceHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/inference/deployInstanceStat",
Handler: inference.DeployInstanceStatHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/inference/taskStat",
Handler: inference.InferenceTaskStatHandler(serverCtx),
},
},
rest.WithPrefix("/pcm/v1"),
)

View File

@ -0,0 +1,47 @@
package ai
import (
"context"
"errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
// TrainingTaskStatLogic holds the per-request state used to compute the
// training-task statistics.
type TrainingTaskStatLogic struct {
	logx.Logger                     // logger created from the request context
	ctx         context.Context     // request context
	svcCtx      *svc.ServiceContext // shared service dependencies
}
// NewTrainingTaskStatLogic creates a TrainingTaskStatLogic for one request.
// The embedded logger is derived from ctx; ctx and svcCtx are stored as given.
func NewTrainingTaskStatLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TrainingTaskStatLogic {
	logic := new(TrainingTaskStatLogic)
	logic.Logger = logx.WithContext(ctx)
	logic.ctx = ctx
	logic.svcCtx = svcCtx
	return logic
}
// TrainingTaskStat returns the total and running training-task counters read
// from the AI storage layer.
//
// Storage errors are returned unchanged. A total of zero is a valid answer
// (no training task has been recorded yet) and yields a response with zero
// counters; previously that case was reported as a failure. Only a negative
// counter, which a COUNT(*) query cannot legitimately produce, is rejected
// with "get statistics failed".
func (l *TrainingTaskStatLogic) TrainingTaskStat() (resp *types.TrainingTaskStatResp, err error) {
	total, err := l.svcCtx.Scheduler.AiStorages.GetTrainingTaskTotalNum()
	if err != nil {
		return nil, err
	}

	running, err := l.svcCtx.Scheduler.AiStorages.GetTrainingTaskRunningNum()
	if err != nil {
		return nil, err
	}

	if total < 0 || running < 0 {
		return nil, errors.New("get statistics failed")
	}

	return &types.TrainingTaskStatResp{
		Running: running,
		Total:   total,
	}, nil
}

View File

@ -2,11 +2,11 @@ package inference
import (
"context"
"jcc-coordinator/internal/svc"
"jcc-coordinator/internal/types"
"errors"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
)
type DeployInstanceListLogic struct {
@ -24,7 +24,35 @@ func NewDeployInstanceListLogic(ctx context.Context, svcCtx *svc.ServiceContext)
}
// DeployInstanceList returns one page of inference deploy instances, newest
// first (ordered by create_time descending), together with the total number
// of rows in ai_infer_deploy_instance.
//
// req.PageSize is used as the page length and req.PageNum as the 1-based page
// index; a page index below 1 is treated as the first page. The total is
// counted with its own query and its error is checked immediately; the page
// query applies ORDER/LIMIT/OFFSET in a single chain, so the whole table is no
// longer loaded just to be discarded.
func (l *DeployInstanceListLogic) DeployInstanceList(req *types.DeployInstanceListReq) (resp *types.DeployInstanceListResp, err error) {
	const table = "ai_infer_deploy_instance"

	limit := req.PageSize
	offset := req.PageSize * (req.PageNum - 1)
	if offset < 0 {
		offset = 0
	}

	// Count all rows (not just the requested page).
	var total int64
	if err = l.svcCtx.DbEngin.Table(table).Count(&total).Error; err != nil {
		logx.Errorf("count deploy instances: %v", err)
		return nil, err
	}

	var list []*models.AiInferDeployInstance
	err = l.svcCtx.DbEngin.Table(table).
		Order("create_time desc").
		Limit(limit).
		Offset(offset).
		Find(&list).Error
	if err != nil {
		return nil, errors.New(err.Error())
	}

	resp = &types.DeployInstanceListResp{}
	resp.List = &list
	resp.PageSize = req.PageSize
	resp.PageNum = req.PageNum
	resp.Total = total
	return resp, nil
}

View File

@ -0,0 +1,47 @@
package inference
import (
"context"
"errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
// DeployInstanceStatLogic holds the per-request state used to compute the
// deploy-instance statistics.
type DeployInstanceStatLogic struct {
	logx.Logger                     // logger created from the request context
	ctx         context.Context     // request context
	svcCtx      *svc.ServiceContext // shared service dependencies
}
// NewDeployInstanceStatLogic creates a DeployInstanceStatLogic for one
// request. The embedded logger is derived from ctx; ctx and svcCtx are stored
// as given.
func NewDeployInstanceStatLogic(ctx context.Context, svcCtx *svc.ServiceContext) *DeployInstanceStatLogic {
	logic := new(DeployInstanceStatLogic)
	logic.Logger = logx.WithContext(ctx)
	logic.ctx = ctx
	logic.svcCtx = svcCtx
	return logic
}
// DeployInstanceStat returns the total and running deploy-instance counters
// read from the AI storage layer. req carries no fields and is not inspected.
//
// Storage errors are returned unchanged. A total of zero is a valid answer
// (no instance has been deployed yet) and yields a response with zero
// counters; previously that case was reported as a failure. Only a negative
// counter, which a COUNT(*) query cannot legitimately produce, is rejected
// with "get statistics failed".
func (l *DeployInstanceStatLogic) DeployInstanceStat(req *types.DeployInstanceStatReq) (resp *types.DeployInstanceStatResp, err error) {
	total, err := l.svcCtx.Scheduler.AiStorages.GetInferDeployInstanceTotalNum()
	if err != nil {
		return nil, err
	}

	running, err := l.svcCtx.Scheduler.AiStorages.GetInferDeployInstanceRunningNum()
	if err != nil {
		return nil, err
	}

	if total < 0 || running < 0 {
		return nil, errors.New("get statistics failed")
	}

	return &types.DeployInstanceStatResp{
		Running: running,
		Total:   total,
	}, nil
}

View File

@ -0,0 +1,47 @@
package inference
import (
"context"
"errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
// InferenceTaskStatLogic holds the per-request state used to compute the
// inference-task statistics.
type InferenceTaskStatLogic struct {
	logx.Logger                     // logger created from the request context
	ctx         context.Context     // request context
	svcCtx      *svc.ServiceContext // shared service dependencies
}
// NewInferenceTaskStatLogic creates an InferenceTaskStatLogic for one request.
// The embedded logger is derived from ctx; ctx and svcCtx are stored as given.
func NewInferenceTaskStatLogic(ctx context.Context, svcCtx *svc.ServiceContext) *InferenceTaskStatLogic {
	logic := new(InferenceTaskStatLogic)
	logic.Logger = logx.WithContext(ctx)
	logic.ctx = ctx
	logic.svcCtx = svcCtx
	return logic
}
// InferenceTaskStat returns the total and running inference-task counters
// read from the AI storage layer. req carries no fields and is not inspected.
//
// Storage errors are returned unchanged. A total of zero is a valid answer
// (no inference task has been recorded yet) and yields a response with zero
// counters; previously that case was reported as a failure. Only a negative
// counter, which a COUNT(*) query cannot legitimately produce, is rejected
// with "get statistics failed".
func (l *InferenceTaskStatLogic) InferenceTaskStat(req *types.InferenceTaskStatReq) (resp *types.InferenceTaskStatResp, err error) {
	total, err := l.svcCtx.Scheduler.AiStorages.GetInferenceTaskTotalNum()
	if err != nil {
		return nil, err
	}

	running, err := l.svcCtx.Scheduler.AiStorages.GetInferenceTaskRunningNum()
	if err != nil {
		return nil, err
	}

	if total < 0 || running < 0 {
		return nil, errors.New("get statistics failed")
	}

	return &types.InferenceTaskStatResp{
		Running: running,
		Total:   total,
	}, nil
}

View File

@ -2,11 +2,11 @@ package inference
import (
"context"
"jcc-coordinator/internal/svc"
"jcc-coordinator/internal/types"
"errors"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"strconv"
)
type StartDeployInstanceListLogic struct {
@ -24,7 +24,19 @@ func NewStartDeployInstanceListLogic(ctx context.Context, svcCtx *svc.ServiceCon
}
// StartDeployInstanceList starts a deploy instance on its cluster and then
// persists the instance record.
//
// req.Id is the numeric id of the stored record, req.InstanceId the id passed
// to the cluster adapter, and req.AdapterId / req.ClusterId select that
// adapter from InferenceAdapterMap.
//
// Everything that can be validated locally is checked before the remote call:
// the record id must parse, the record must exist, and the adapter/cluster
// pair must be registered (an unknown pair used to end in a call on a missing
// map entry). The record is only updated after the adapter reports success;
// a failure of that update is logged but does not turn the already successful
// start into an error.
func (l *StartDeployInstanceListLogic) StartDeployInstanceList(req *types.StartDeployInstanceReq) (resp *types.StartDeployInstanceResp, err error) {
	id, err := strconv.ParseInt(req.Id, 10, 64)
	if err != nil {
		return nil, errors.New("invalid instance record id: " + req.Id)
	}

	ins, err := l.svcCtx.Scheduler.AiStorages.GetInferDeployInstanceById(id)
	if err != nil {
		return nil, err
	}

	clusters, found := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[req.AdapterId]
	if !found {
		return nil, errors.New("adapter not found: " + req.AdapterId)
	}
	cluster, found := clusters[req.ClusterId]
	if !found {
		return nil, errors.New("cluster not found: " + req.ClusterId)
	}

	if !cluster.StartInferDeployInstance(l.ctx, req.InstanceId) {
		return nil, errors.New("start instance failed")
	}

	if err = l.svcCtx.Scheduler.AiStorages.UpdateInferDeployInstance(ins); err != nil {
		logx.Errorf("update deploy instance %d after start: %v", id, err)
	}

	return &types.StartDeployInstanceResp{}, nil
}

View File

@ -2,11 +2,11 @@ package inference
import (
"context"
"jcc-coordinator/internal/svc"
"jcc-coordinator/internal/types"
"errors"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"strconv"
)
type StopDeployInstanceLogic struct {
@ -24,7 +24,18 @@ func NewStopDeployInstanceLogic(ctx context.Context, svcCtx *svc.ServiceContext)
}
// StopDeployInstance stops a deploy instance on its cluster and then persists
// the instance record.
//
// req.Id is the numeric id of the stored record, req.InstanceId the id passed
// to the cluster adapter, and req.AdapterId / req.ClusterId select that
// adapter from InferenceAdapterMap.
//
// Everything that can be validated locally is checked before the remote call:
// the record id must parse, the record must exist, and the adapter/cluster
// pair must be registered (an unknown pair used to end in a call on a missing
// map entry). The record is only updated after the adapter reports success;
// a failure of that update is logged but does not turn the already successful
// stop into an error.
func (l *StopDeployInstanceLogic) StopDeployInstance(req *types.StopDeployInstanceReq) (resp *types.StopDeployInstanceResp, err error) {
	id, err := strconv.ParseInt(req.Id, 10, 64)
	if err != nil {
		return nil, errors.New("invalid instance record id: " + req.Id)
	}

	ins, err := l.svcCtx.Scheduler.AiStorages.GetInferDeployInstanceById(id)
	if err != nil {
		return nil, err
	}

	clusters, found := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[req.AdapterId]
	if !found {
		return nil, errors.New("adapter not found: " + req.AdapterId)
	}
	cluster, found := clusters[req.ClusterId]
	if !found {
		return nil, errors.New("cluster not found: " + req.ClusterId)
	}

	if !cluster.StopInferDeployInstance(l.ctx, req.InstanceId) {
		return nil, errors.New("stop instance failed")
	}

	if err = l.svcCtx.Scheduler.AiStorages.UpdateInferDeployInstance(ins); err != nil {
		logx.Errorf("update deploy instance %d after stop: %v", id, err)
	}

	return &types.StopDeployInstanceResp{}, nil
}

View File

@ -373,19 +373,20 @@ func (s *AiStorage) AddNoticeInfo(adapterId string, adapterName string, clusterI
}
}
func (s *AiStorage) SaveInferDeployInstance() (int64, error) {
func (s *AiStorage) SaveInferDeployInstance(instanceId string, instanceName string, adapterId int64,
adapterName string, clusterId int64, clusterName string, modelName string, modelType string, inferCard string) (int64, error) {
startTime := time.Now().Format(time.RFC3339)
// 构建主任务结构体
insModel := models.AiInferDeployInstance{
InstanceId: "",
InstanceName: "",
AdapterId: 123,
AdapterName: "",
ClusterId: 123,
ClusterName: "",
ModelName: "",
ModelType: "",
InferCard: "",
InstanceId: instanceId,
InstanceName: instanceName,
AdapterId: adapterId,
AdapterName: adapterName,
ClusterId: clusterId,
ClusterName: clusterName,
ModelName: modelName,
ModelType: modelType,
InferCard: inferCard,
Status: constants.Saved,
CreateTime: startTime,
UpdateTime: startTime,
@ -417,3 +418,73 @@ func (s *AiStorage) GetInferDeployInstanceById(id int64) (*models.AiInferDeployI
}
return &deployIns, nil
}
// GetInferDeployInstanceList loads every row of ai_infer_deploy_instance.
// On a query error the error is logged and returned with a nil slice.
func (s *AiStorage) GetInferDeployInstanceList() ([]*models.AiInferDeployInstance, error) {
	var list []*models.AiInferDeployInstance
	tx := s.DbEngin.Raw("select * from ai_infer_deploy_instance").Scan(&list)
	if tx.Error != nil {
		// Log through an explicit format so that '%' characters in the
		// database message are not interpreted as formatting verbs.
		logx.Errorf("list deploy instances: %v", tx.Error)
		return nil, tx.Error
	}
	return list, nil
}
// GetInferDeployInstanceTotalNum returns the number of rows in
// ai_infer_deploy_instance. On a query error the error is logged and returned
// with a count of 0.
func (s *AiStorage) GetInferDeployInstanceTotalNum() (int32, error) {
	var total int32
	tx := s.DbEngin.Raw("select count(*) from ai_infer_deploy_instance").Scan(&total)
	if tx.Error != nil {
		// Log through an explicit format so that '%' characters in the
		// database message are not interpreted as formatting verbs.
		logx.Errorf("count deploy instances: %v", tx.Error)
		return 0, tx.Error
	}
	return total, nil
}
// GetInferDeployInstanceRunningNum returns the number of rows in
// ai_infer_deploy_instance whose status is 'running'. On a query error the
// error is logged and returned with a count of 0.
func (s *AiStorage) GetInferDeployInstanceRunningNum() (int32, error) {
	var running int32
	tx := s.DbEngin.Raw("select count(*) from ai_infer_deploy_instance where `status` = 'running'").Scan(&running)
	if tx.Error != nil {
		// Log through an explicit format so that '%' characters in the
		// database message are not interpreted as formatting verbs.
		logx.Errorf("count running deploy instances: %v", tx.Error)
		return 0, tx.Error
	}
	return running, nil
}
// GetInferenceTaskTotalNum returns the number of rows in task whose
// task_type_dict is 11 or 12. On a query error the error is logged and
// returned with a count of 0.
func (s *AiStorage) GetInferenceTaskTotalNum() (int32, error) {
	var total int32
	tx := s.DbEngin.Raw("select count(*) from task where `task_type_dict` = 11 or `task_type_dict` = 12").Scan(&total)
	if tx.Error != nil {
		// Log through an explicit format so that '%' characters in the
		// database message are not interpreted as formatting verbs.
		logx.Errorf("count inference tasks: %v", tx.Error)
		return 0, tx.Error
	}
	return total, nil
}
// GetInferenceTaskRunningNum returns the number of inference tasks whose
// status is 'Running'. On a query error the error is logged and returned with
// a count of 0.
//
// The type filter matches GetInferenceTaskTotalNum (task_type_dict 11 or 12)
// so that "running" is always a subset of "total"; it previously counted only
// type 11.
func (s *AiStorage) GetInferenceTaskRunningNum() (int32, error) {
	var running int32
	tx := s.DbEngin.Raw("select count(*) from task where (`task_type_dict` = 11 or `task_type_dict` = 12) and `status` = 'Running'").Scan(&running)
	if tx.Error != nil {
		// Log through an explicit format so that '%' characters in the
		// database message are not interpreted as formatting verbs.
		logx.Errorf("count running inference tasks: %v", tx.Error)
		return 0, tx.Error
	}
	return running, nil
}
// GetTrainingTaskTotalNum returns the number of rows in task whose
// task_type_dict is 10. On a query error the error is logged and returned
// with a count of 0.
func (s *AiStorage) GetTrainingTaskTotalNum() (int32, error) {
	var total int32
	tx := s.DbEngin.Raw("select count(*) from task where `task_type_dict` = 10").Scan(&total)
	if tx.Error != nil {
		// Log through an explicit format so that '%' characters in the
		// database message are not interpreted as formatting verbs.
		logx.Errorf("count training tasks: %v", tx.Error)
		return 0, tx.Error
	}
	return total, nil
}
// GetTrainingTaskRunningNum returns the number of training tasks
// (task_type_dict = 10, the same type GetTrainingTaskTotalNum counts) whose
// status is 'Running'. On a query error the error is logged and returned with
// a count of 0.
//
// The query previously filtered on task_type_dict = 11, i.e. it reported the
// running inference tasks instead of the running training tasks.
func (s *AiStorage) GetTrainingTaskRunningNum() (int32, error) {
	var running int32
	tx := s.DbEngin.Raw("select count(*) from task where `task_type_dict` = 10 and `status` = 'Running'").Scan(&running)
	if tx.Error != nil {
		// Log through an explicit format so that '%' characters in the
		// database message are not interpreted as formatting verbs.
		logx.Errorf("count running training tasks: %v", tx.Error)
		return 0, tx.Error
	}
	return running, nil
}

View File

@ -0,0 +1,51 @@
package updater
import (
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"net/http"
"strconv"
)
// UpdateDeployInstanceStatus refreshes instance.Status from the cluster that
// hosts the instance and writes the record back through AiStorages.
//
// The adapter is looked up in InferenceAdapterMap by the decimal form of
// instance.AdapterId and instance.ClusterId. The function is best effort: it
// returns silently when the adapter or cluster is not registered, when the
// remote lookup fails, or when the adapter returns no instance.
//
// Status mapping per cluster type:
//   - TYPE_OCTOPUS:    "running" -> constants.Running, "stopped" -> constants.Stopped
//   - TYPE_SHUGUANGAI: "running" -> constants.Running, "Terminated" -> constants.Stopped
//
// Any other remote status is copied verbatim; for any other cluster type the
// stored status is left untouched.
func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInferDeployInstance) {
	amap, found := svc.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(instance.AdapterId, 10)]
	if !found {
		return
	}
	cmap, found := amap[strconv.FormatInt(instance.ClusterId, 10)]
	if !found {
		return
	}

	// No caller context is available here; the Context of a zero
	// http.Request is the background context.
	h := http.Request{}
	ins, err := cmap.GetInferDeployInstance(h.Context(), instance.InstanceId)
	if err != nil || ins == nil {
		// An adapter may report "not found" as (nil, nil); dereferencing
		// ins below would panic in that case.
		return
	}

	switch instance.ClusterType {
	case storeLink.TYPE_OCTOPUS:
		switch ins.Status {
		case "running":
			instance.Status = constants.Running
		case "stopped":
			instance.Status = constants.Stopped
		default:
			instance.Status = ins.Status
		}
	case storeLink.TYPE_SHUGUANGAI:
		switch ins.Status {
		case "running":
			instance.Status = constants.Running
		case "Terminated":
			instance.Status = constants.Stopped
		default:
			instance.Status = ins.Status
		}
	}

	// Best effort: there is no caller to report a persistence failure to.
	_ = svc.Scheduler.AiStorages.UpdateInferDeployInstance(instance)
}

View File

@ -877,6 +877,7 @@ func (o *OctopusLink) GetInferUrl(ctx context.Context, option *option.InferOptio
Platform: o.platform,
PageIndex: o.pageIndex,
PageSize: o.pageSize,
SearchKey: DEPLOY_INSTANCE_PREFIEX,
}
list, err := o.octopusRpc.GetNotebookList(ctx, req)
if err != nil {
@ -885,9 +886,9 @@ func (o *OctopusLink) GetInferUrl(ctx context.Context, option *option.InferOptio
var imageUrls []*inference.InferUrl
for _, notebook := range list.Payload.GetNotebooks() {
if strings.Contains(notebook.AlgorithmName, option.ModelName) && notebook.Status == "running" {
if strings.Contains(notebook.Desc, option.ModelName) && notebook.Status == "running" {
url := strings.Replace(notebook.Tasks[0].Url, FORWARD_SLASH, "", -1)
names := strings.Split(notebook.AlgorithmName, UNDERSCORE)
names := strings.Split(notebook.Desc, FORWARD_SLASH)
imageUrl := &inference.InferUrl{
Url: DOMAIN + url,
Card: names[2],
@ -955,5 +956,23 @@ func (o *OctopusLink) StopInferDeployInstance(ctx context.Context, id string) bo
}
// GetInferDeployInstance fetches the notebook with the given id from the
// Octopus platform and converts it into a DeployInstance (name, id, status,
// with ClusterName set to o.platform).
//
// RPC errors are returned unchanged; a response without payload is reported
// as "instance does not exist". The leftover stub "return nil, nil" that made
// the whole implementation unreachable has been removed.
func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
	req := &octopus.GetNotebookReq{
		Platform: o.platform,
		Id:       id,
	}

	resp, err := o.octopusRpc.GetNotebook(ctx, req)
	if err != nil {
		return nil, err
	}
	if resp.Payload == nil {
		return nil, errors.New("instance does not exist")
	}

	ins := &inference.DeployInstance{}
	ins.InstanceName = resp.Payload.Notebook.Name
	ins.InstanceId = resp.Payload.Notebook.Id
	ins.ClusterName = o.platform
	ins.Status = resp.Payload.Notebook.Status
	return ins, nil
}

View File

@ -851,5 +851,19 @@ func (s *ShuguangAi) StopInferDeployInstance(ctx context.Context, id string) boo
}
// GetInferDeployInstance fetches the instance service with the given id from
// the Shuguang AI platform and converts it into a DeployInstance (name, id,
// status, creation time, with ClusterName set to s.platform and InferCard set
// to DCU).
//
// RPC errors are returned unchanged. A response whose Code is not "0" is
// reported as an error; it used to be returned as (nil, nil), which callers
// could not tell apart from success. The leftover stub "return nil, nil" that
// made the whole implementation unreachable has been removed.
func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
	req := &hpcAC.GetInstanceServiceDetailReq{
		Id: id,
	}

	resp, err := s.aCRpc.GetInstanceServiceDetail(ctx, req)
	if err != nil {
		return nil, err
	}
	if resp.Code != "0" {
		return nil, errors.New("get instance service detail failed, code: " + resp.Code)
	}

	ins := &inference.DeployInstance{}
	ins.InstanceName = resp.Data.InstanceServiceName
	ins.InstanceId = resp.Data.Id
	ins.ClusterName = s.platform
	ins.Status = resp.Data.Status
	ins.InferCard = DCU
	ins.CreatedTime = resp.Data.CreateTime
	return ins, nil
}

View File

@ -79,8 +79,8 @@ var (
4: "制作失败",
}
ModelTypeMap = map[string][]string{
"image_recognition": {"imagenet_resnet50"},
"text_to_text": {"chatGLM_6B"},
"image_classification": {"imagenet_resnet50"},
"text_to_text": {"chatGLM_6B"},
}
AITYPE = map[string]string{
"1": OCTOPUS,

View File

@ -2902,6 +2902,11 @@ type AiTask struct {
TimeElapsed int32 `json:"elapsed,optional"`
}
// TrainingTaskStatResp is the response body of the training-task statistics
// endpoint.
type TrainingTaskStatResp struct {
	Running int32 `json:"running"` // number of running training tasks
	Total   int32 `json:"total"`   // number of training tasks
}
type ChatReq struct {
ApiUrl string `json:"apiUrl"`
Method string `json:"method,optional"`
@ -5975,7 +5980,8 @@ type DeployInstanceListResp struct {
// StartDeployInstanceReq is the form payload of the start-deploy-instance
// endpoint.
//
// The stale `InstanceId ... form:"id"` field has been dropped: it declared
// InstanceId twice and bound the "id" form key to two fields.
type StartDeployInstanceReq struct {
	AdapterId  string `form:"adapterId"`  // key of the adapter in the inference adapter map
	ClusterId  string `form:"clusterId"`  // key of the cluster under that adapter
	Id         string `form:"id"`         // decimal id of the stored instance record
	InstanceId string `form:"instanceId"` // instance id passed to the cluster adapter
}
type StartDeployInstanceResp struct {
@ -5984,8 +5990,25 @@ type StartDeployInstanceResp struct {
// StopDeployInstanceReq is the form payload of the stop-deploy-instance
// endpoint.
//
// The stale `InstanceId ... form:"id"` field has been dropped: it declared
// InstanceId twice and bound the "id" form key to two fields.
type StopDeployInstanceReq struct {
	AdapterId  string `form:"adapterId"`  // key of the adapter in the inference adapter map
	ClusterId  string `form:"clusterId"`  // key of the cluster under that adapter
	Id         string `form:"id"`         // decimal id of the stored instance record
	InstanceId string `form:"instanceId"` // instance id passed to the cluster adapter
}
// StopDeployInstanceResp is the (empty) response body of the
// stop-deploy-instance endpoint.
type StopDeployInstanceResp struct {
}
// DeployInstanceStatReq is the request of the deploy-instance statistics
// endpoint; it carries no parameters.
type DeployInstanceStatReq struct {
}
// DeployInstanceStatResp is the response body of the deploy-instance
// statistics endpoint.
type DeployInstanceStatResp struct {
	Running int32 `json:"running"` // number of running deploy instances
	Total   int32 `json:"total"`   // number of deploy instances
}
// InferenceTaskStatReq is the request of the inference-task statistics
// endpoint; it carries no parameters.
type InferenceTaskStatReq struct {
}
// InferenceTaskStatResp is the response body of the inference-task statistics
// endpoint.
type InferenceTaskStatResp struct {
	Running int32 `json:"running"` // number of running inference tasks
	Total   int32 `json:"total"`   // number of inference tasks
}

View File

@ -48,6 +48,7 @@ type (
Status string `db:"status"`
CreateTime string `db:"create_time"`
UpdateTime string `db:"update_time"`
ClusterType string `db:"cluster_type"`
}
)