Documentation
¶
Overview ¶
Package device provides a series of device functions.
Package device a series of device function ¶
Package device a series of device function ¶
Package device a series of device function ¶
Package device a series of device function ¶
Package device a series of device function ¶
Package device a series of device function ¶
Package device a series of device function
Index ¶
- func AddBusyDev(cardID, deviceID int32)
- func AddResetCnt(cardID, deviceID int32)
- func ClassifyDevices(allDevs []common.NpuDevice, devTypes []string) map[string][]*common.NpuDevice
- func FreeBusyDev(cardID, deviceID int32)
- func GetResetCnt(cardID, deviceID int32) int
- func InitResetInfoMgr(client *kubeclient.ClientK8s)
- func IsDevBusy(cardID, deviceID int32) bool
- func SetResetCnt(cardID, deviceID int32, cnt int)
- func WriteResetInfo(resetInfo ResetInfo, writeMode WriteMode, updateNode bool)
- type AscendTools
- func (tool *AscendTools) AddPodAnnotation(podDev *common.PodDeviceInfo, deviceType, serverID string, ...) error
- func (tool *AscendTools) AppendVGroupInfo(allocateDevice []string)
- func (tool *AscendTools) CheckDeviceTypeLabel() error
- func (tool *AscendTools) CreateVirtualDevice(phyID int32, templateName string) (string, error)
- func (tool *AscendTools) DestroyVirtualDevice(deviceName string) error
- func (tool *AscendTools) GetChange(groupDevice, oldGroupDevice map[string][]*common.NpuDevice) map[string]bool
- func (tool *AscendTools) GetChipAICore() int32
- func (tool *AscendTools) GetChipAiCoreCount() (int32, error)
- func (tool *AscendTools) GetContainerdClient() *containerd.Client
- func (tool *AscendTools) GetDeviceIP(deviceType string, phyID int) (string, error)
- func (tool *AscendTools) GetDeviceUsage() string
- func (tool *AscendTools) GetDmgr() devmanager.DeviceInterface
- func (tool *AscendTools) GetIfCardsInResetting(deviceLogicId int32) bool
- func (tool *AscendTools) GetKubeClient() *kubeclient.ClientK8s
- func (tool *AscendTools) GetName() string
- func (tool *AscendTools) GetResetFailedTimes(deviceLogicId int32) int
- func (tool *AscendTools) GetServerBoardId(devLogicID int32) (uint32, error)
- func (tool *AscendTools) GetServerIndex() int32
- func (tool *AscendTools) GetSuperPodID() int32
- func (tool *AscendTools) GetUsedChips() sets.String
- func (tool *AscendTools) HandleDropCardFaultEvents(npuDevice *common.NpuDevice)
- func (tool *AscendTools) HandleLostChipFaultEvents(device *common.NpuDevice, initLogicIDs []int32)
- func (tool *AscendTools) HandleLostNetworkFaultEvents(device *common.NpuDevice, initLogicIDs []int32)
- func (tool *AscendTools) LogFaultModeChange(device *common.NpuDevice, initLogicIDs []int32, newMode string)
- func (tool *AscendTools) SetCardsInResetting(deviceLogicId int32, reset bool)
- func (tool *AscendTools) SetContainerdClient(client *containerd.Client)
- func (tool *AscendTools) SetDeviceUsage(devLogicID int32) error
- func (tool *AscendTools) SetDmgr(dmgr devmanager.DeviceInterface)
- func (tool *AscendTools) SetKubeClient(client *kubeclient.ClientK8s)
- func (tool *AscendTools) SetResetFailedTimes(deviceLogicId int32, count int)
- func (tool *AscendTools) SetServerIndex(serverIndex int32)
- func (tool *AscendTools) SetSuperPodID(superPodID int32)
- func (tool *AscendTools) UpdateHealth(groupDevice map[string][]*common.NpuDevice, aiCoreDevs []*common.NpuDevice, ...)
- func (tool *AscendTools) UpdateNodeDeviceInfo(devStatusSet common.DevStatusSet, ...) error
- func (tool *AscendTools) WriteFaultToEvent(ctx context.Context)
- type DevManager
- type HotResetManager
- type HotResetTools
- func (hrt *HotResetTools) DeepCopyDevFaultInfoList(devFaultInfoList []*common.TaskDevInfo) []*common.TaskDevInfo
- func (hrt *HotResetTools) DeepCopyDevInfo(devInfo *common.TaskDevInfo) *common.TaskDevInfo
- func (hrt *HotResetTools) GenerateTaskDevFaultInfoList(devIdList []int32, rankIndex string) ([]*common.TaskDevInfo, error)
- func (hrt *HotResetTools) GetAllTaskDevFaultInfoList() map[string][]*common.TaskDevInfo
- func (hrt *HotResetTools) GetCMFromCache(cmKey string) (*v1.ConfigMap, error)
- func (hrt *HotResetTools) GetDevIdList(devStr string) []int32
- func (hrt *HotResetTools) GetDevListByPolicyLevel(devFaultInfoList []*common.TaskDevInfo, policyLevel int) (map[int32]struct{}, error)
- func (hrt *HotResetTools) GetDevListInReset() map[int32]struct{}
- func (hrt *HotResetTools) GetDevProcessPolicy(faultType string) string
- func (hrt *HotResetTools) GetFaultDev2PodMap() (map[int32]v1.Pod, error)
- func (hrt *HotResetTools) GetGlobalDevFaultInfo(logicID int32) (*common.DevFaultInfo, error)
- func (hrt *HotResetTools) GetNeedResetDevMap(devFaultInfoList []*common.TaskDevInfo) (map[int32]int32, error)
- func (hrt *HotResetTools) GetResetDevNumOnce() (int, error)
- func (hrt *HotResetTools) GetTaskDevFaultInfoList(taskName string) ([]*common.TaskDevInfo, error)
- func (hrt *HotResetTools) GetTaskFaultRankInfo(devFaultInfoList []*common.TaskDevInfo) (*common.TaskFaultInfo, error)
- func (hrt *HotResetTools) GetTaskNameByPod(pod v1.Pod) string
- func (hrt *HotResetTools) GetTaskPod(taskName string) (v1.Pod, error)
- func (hrt *HotResetTools) GetTaskProcessPolicy(taskName string) (string, int, error)
- func (hrt *HotResetTools) GetTaskResetInfo(devFaultInfoList []*common.TaskDevInfo, policy, initPolicy, status string) (*common.TaskResetInfo, error)
- func (hrt *HotResetTools) IsCurNodeTaskInReset(taskName string) bool
- func (hrt *HotResetTools) IsExistFaultyDevInTask(taskName string) bool
- func (hrt *HotResetTools) SetAllDevInReset(resetInfo *common.TaskResetInfo) error
- func (hrt *HotResetTools) SetDevInReset(devId int32) error
- func (hrt *HotResetTools) SetTaskInReset(taskName string) error
- func (hrt *HotResetTools) SyncResetCM(ctx context.Context, client *kubeclient.ClientK8s)
- func (hrt *HotResetTools) UnSetAllDevInReset(resetInfo *common.TaskResetInfo) error
- func (hrt *HotResetTools) UnSetDevInReset(devId int32) error
- func (hrt *HotResetTools) UnSetTaskInReset(taskName string) error
- func (hrt *HotResetTools) UpdateFaultDev2PodMap(devList []int32, pod v1.Pod) error
- func (hrt *HotResetTools) UpdateFreeTask(taskListUsedDevice map[string]struct{}, newTaskDevList map[string][]int32)
- func (hrt *HotResetTools) UpdateGlobalDevFaultInfoCache(devDeviceList []*common.NpuDevice, isoDevList []int32) error
- func (hrt *HotResetTools) UpdateTaskDevFaultInfoCache(taskDevFaultInfo map[string][]*common.TaskDevInfo) error
- func (hrt *HotResetTools) UpdateTaskDevListCache(taskDevList map[string][]int32) error
- func (hrt *HotResetTools) UpdateTaskPodCache(taskPod map[string]v1.Pod) error
- type HwAscend310Manager
- type HwAscend310PManager
- type HwAscend910Manager
- type ResetDevice
- type ResetInfo
- type ResetInfoMgr
- type WriteMode
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ClassifyDevices ¶
ClassifyDevices classifies devices by device type
func FreeBusyDev ¶
func FreeBusyDev(cardID, deviceID int32)
FreeBusyDev remove a device from busy map
func GetResetCnt ¶
GetResetCnt gets the device reset count by physical ID
func InitResetInfoMgr ¶
func InitResetInfoMgr(client *kubeclient.ClientK8s)
InitResetInfoMgr initialize ResetInfoMgr globally
func IsDevBusy ¶
IsDevBusy checks whether a device is busy, for example being reset, waiting for a third-party reset, or waiting for a manual reset
func SetResetCnt ¶
SetResetCnt set device reset count
func WriteResetInfo ¶
WriteResetInfo write reset info into cache and node annotation
Types ¶
type AscendTools ¶
type AscendTools struct {
// contains filtered or unexported fields
}
AscendTools struct definition
func (*AscendTools) AddPodAnnotation ¶
func (tool *AscendTools) AddPodAnnotation(podDev *common.PodDeviceInfo, deviceType, serverID string, allDevices []common.NpuDevice) error
AddPodAnnotation checks and updates pod annotations, correcting the annotations 'AscendReal', 'Ascend910', 'kltDev' and 'ascend-910-configuration' every 5s
func (*AscendTools) AppendVGroupInfo ¶
func (tool *AscendTools) AppendVGroupInfo(allocateDevice []string)
AppendVGroupInfo append virtual group id info after device name
func (*AscendTools) CheckDeviceTypeLabel ¶
func (tool *AscendTools) CheckDeviceTypeLabel() error
CheckDeviceTypeLabel check device type label
func (*AscendTools) CreateVirtualDevice ¶
func (tool *AscendTools) CreateVirtualDevice(phyID int32, templateName string) (string, error)
CreateVirtualDevice create virtual device
func (*AscendTools) DestroyVirtualDevice ¶
func (tool *AscendTools) DestroyVirtualDevice(deviceName string) error
DestroyVirtualDevice destroy virtual device
func (*AscendTools) GetChange ¶
func (tool *AscendTools) GetChange(groupDevice, oldGroupDevice map[string][]*common.NpuDevice) map[string]bool
GetChange check if groupDevice changes
func (*AscendTools) GetChipAICore ¶
func (tool *AscendTools) GetChipAICore() int32
GetChipAICore get ai core
func (*AscendTools) GetChipAiCoreCount ¶
func (tool *AscendTools) GetChipAiCoreCount() (int32, error)
GetChipAiCoreCount get chip aicore count
func (*AscendTools) GetContainerdClient ¶
func (tool *AscendTools) GetContainerdClient() *containerd.Client
GetContainerdClient get containerd Client
func (*AscendTools) GetDeviceIP ¶
func (tool *AscendTools) GetDeviceIP(deviceType string, phyID int) (string, error)
GetDeviceIP get device ip
func (*AscendTools) GetDeviceUsage ¶
func (tool *AscendTools) GetDeviceUsage() string
GetDeviceUsage return usage of device, infer or train
func (*AscendTools) GetDmgr ¶
func (tool *AscendTools) GetDmgr() devmanager.DeviceInterface
GetDmgr get devmanager
func (*AscendTools) GetIfCardsInResetting ¶
func (tool *AscendTools) GetIfCardsInResetting(deviceLogicId int32) bool
GetIfCardsInResetting gets whether all cards are in the resetting process
func (*AscendTools) GetKubeClient ¶
func (tool *AscendTools) GetKubeClient() *kubeclient.ClientK8s
GetKubeClient get ClientK8s
func (*AscendTools) GetResetFailedTimes ¶
func (tool *AscendTools) GetResetFailedTimes(deviceLogicId int32) int
GetResetFailedTimes gets how many times the reset process has failed in a row
func (*AscendTools) GetServerBoardId ¶
func (tool *AscendTools) GetServerBoardId(devLogicID int32) (uint32, error)
GetServerBoardId get server board id
func (*AscendTools) GetServerIndex ¶
func (tool *AscendTools) GetServerIndex() int32
GetServerIndex getting the index from server
func (*AscendTools) GetSuperPodID ¶
func (tool *AscendTools) GetSuperPodID() int32
GetSuperPodID getting super pod id
func (*AscendTools) GetUsedChips ¶
func (tool *AscendTools) GetUsedChips() sets.String
GetUsedChips return chips used by process and containerd
func (*AscendTools) HandleDropCardFaultEvents ¶
func (tool *AscendTools) HandleDropCardFaultEvents(npuDevice *common.NpuDevice)
HandleDropCardFaultEvents handle drop card fault events that may be lost by the fault subscription interface
func (*AscendTools) HandleLostChipFaultEvents ¶
func (tool *AscendTools) HandleLostChipFaultEvents(device *common.NpuDevice, initLogicIDs []int32)
HandleLostChipFaultEvents handle chip fault events that may be lost by the fault subscription interface
func (*AscendTools) HandleLostNetworkFaultEvents ¶
func (tool *AscendTools) HandleLostNetworkFaultEvents(device *common.NpuDevice, initLogicIDs []int32)
HandleLostNetworkFaultEvents handle network fault events that may be lost by the fault subscription interface
func (*AscendTools) LogFaultModeChange ¶
func (tool *AscendTools) LogFaultModeChange(device *common.NpuDevice, initLogicIDs []int32, newMode string)
LogFaultModeChange print logs when fault mode changed
func (*AscendTools) SetCardsInResetting ¶
func (tool *AscendTools) SetCardsInResetting(deviceLogicId int32, reset bool)
SetCardsInResetting sets the indicator of whether all cards are in the resetting process
func (*AscendTools) SetContainerdClient ¶
func (tool *AscendTools) SetContainerdClient(client *containerd.Client)
SetContainerdClient set containerd Client
func (*AscendTools) SetDeviceUsage ¶
func (tool *AscendTools) SetDeviceUsage(devLogicID int32) error
SetDeviceUsage set usage of device according to board info
func (*AscendTools) SetDmgr ¶
func (tool *AscendTools) SetDmgr(dmgr devmanager.DeviceInterface)
SetDmgr set devmanager
func (*AscendTools) SetKubeClient ¶
func (tool *AscendTools) SetKubeClient(client *kubeclient.ClientK8s)
SetKubeClient set ClientK8s
func (*AscendTools) SetResetFailedTimes ¶
func (tool *AscendTools) SetResetFailedTimes(deviceLogicId int32, count int)
SetResetFailedTimes set the counter of how many times the reset process has failed
func (*AscendTools) SetServerIndex ¶
func (tool *AscendTools) SetServerIndex(serverIndex int32)
SetServerIndex setting the index from server
func (*AscendTools) SetSuperPodID ¶
func (tool *AscendTools) SetSuperPodID(superPodID int32)
SetSuperPodID set super pod id
func (*AscendTools) UpdateHealth ¶
func (tool *AscendTools) UpdateHealth(groupDevice map[string][]*common.NpuDevice, aiCoreDevs []*common.NpuDevice, runMode string)
UpdateHealth update group device healthy
func (*AscendTools) UpdateNodeDeviceInfo ¶
func (tool *AscendTools) UpdateNodeDeviceInfo(devStatusSet common.DevStatusSet, updateDeviceInfoFunc func(map[string]string, map[string]string, common.DevStatusSet) error) error
UpdateNodeDeviceInfo update device info
func (*AscendTools) WriteFaultToEvent ¶
func (tool *AscendTools) WriteFaultToEvent(ctx context.Context)
WriteFaultToEvent write fault to event
type DevManager ¶
type DevManager interface {
GetNPUs() (common.NpuAllInfo, error)
DoWithVolcanoListAndWatch(map[string][]*common.NpuDevice)
GraceTolerance(context.Context, map[string][]*common.NpuDevice)
SetDmgr(devmanager.DeviceInterface)
GetDmgr() devmanager.DeviceInterface
GetChipAICore() int32
GetName() string
SetKubeClient(*kubeclient.ClientK8s)
GetKubeClient() *kubeclient.ClientK8s
SetContainerdClient(*containerd.Client)
GetContainerdClient() *containerd.Client
UpdateHealth(map[string][]*common.NpuDevice, []*common.NpuDevice, string)
GetChange(map[string][]*common.NpuDevice, map[string][]*common.NpuDevice) map[string]bool
AddPodAnnotation(*common.PodDeviceInfo, string, string, []common.NpuDevice) error
AppendVGroupInfo([]string)
CheckDeviceTypeLabel() error
CreateVirtualDevice(int32, string) (string, error)
DestroyVirtualDevice(string) error
GetChipAiCoreCount() (int32, error)
SetDeviceUsage(int32) error
GetDeviceUsage() string
SetSuperPodID(superPodID int32)
GetSuperPodID() int32
SetServerIndex(serverIndex int32)
GetServerIndex() int32
GetServerBoardId(devLogicID int32) (uint32, error)
SetCardsInResetting(int32, bool)
GetIfCardsInResetting(int32) bool
GetResetFailedTimes(int32) int
SetResetFailedTimes(int32, int)
HandleDropCardFaultEvents(*common.NpuDevice)
HandleLostChipFaultEvents(*common.NpuDevice, []int32)
HandleLostNetworkFaultEvents(*common.NpuDevice, []int32)
LogFaultModeChange(*common.NpuDevice, []int32, string)
GetUsedChips() sets.String
GetDeviceIP(deviceType string, phyID int) (string, error)
WriteFaultToEvent(ctx context.Context)
}
DevManager is the interface for managing devices
type HotResetManager ¶
type HotResetManager interface {
GetResetDevNumOnce() (int, error)
GetDevIdList(string) []int32
GetTaskDevFaultInfoList(string) ([]*common.TaskDevInfo, error)
GetTaskPod(string) (v1.Pod, error)
GetAllTaskDevFaultInfoList() map[string][]*common.TaskDevInfo
GetDevProcessPolicy(string) string
GetTaskProcessPolicy(string) (string, int, error)
GetDevListInReset() map[int32]struct{}
GetDevListByPolicyLevel([]*common.TaskDevInfo, int) (map[int32]struct{}, error)
GetNeedResetDevMap([]*common.TaskDevInfo) (map[int32]int32, error)
GetGlobalDevFaultInfo(logicID int32) (*common.DevFaultInfo, error)
GetTaskResetInfo([]*common.TaskDevInfo, string, string, string) (*common.TaskResetInfo, error)
GetTaskFaultRankInfo([]*common.TaskDevInfo) (*common.TaskFaultInfo, error)
GetFaultDev2PodMap() (map[int32]v1.Pod, error)
GetTaskNameByPod(pod v1.Pod) string
GenerateTaskDevFaultInfoList(devIdList []int32, rankIndex string) ([]*common.TaskDevInfo, error)
UpdateFaultDev2PodMap([]int32, v1.Pod) error
UpdateGlobalDevFaultInfoCache([]*common.NpuDevice, []int32) error
UpdateTaskDevListCache(map[string][]int32) error
UpdateTaskDevFaultInfoCache(map[string][]*common.TaskDevInfo) error
UpdateTaskPodCache(map[string]v1.Pod) error
UpdateFreeTask(map[string]struct{}, map[string][]int32)
SetTaskInReset(string) error
SetDevInReset(int32) error
SetAllDevInReset(info *common.TaskResetInfo) error
UnSetTaskInReset(string) error
UnSetDevInReset(int32) error
UnSetAllDevInReset(*common.TaskResetInfo) error
IsCurNodeTaskInReset(string) bool
IsExistFaultyDevInTask(string) bool
DeepCopyDevInfo(*common.TaskDevInfo) *common.TaskDevInfo
DeepCopyDevFaultInfoList([]*common.TaskDevInfo) []*common.TaskDevInfo
SyncResetCM(context.Context, *kubeclient.ClientK8s)
GetCMFromCache(string) (*v1.ConfigMap, error)
}
HotResetManager hot reset manager
func NewHotResetManager ¶
func NewHotResetManager(devUsage string, deviceNum int) HotResetManager
NewHotResetManager create HotResetManager and init data
type HotResetTools ¶
type HotResetTools struct {
// contains filtered or unexported fields
}
HotResetTools hot reset tool
func (*HotResetTools) DeepCopyDevFaultInfoList ¶
func (hrt *HotResetTools) DeepCopyDevFaultInfoList(devFaultInfoList []*common.TaskDevInfo) []*common.TaskDevInfo
DeepCopyDevFaultInfoList copy device fault info list deeply
func (*HotResetTools) DeepCopyDevInfo ¶
func (hrt *HotResetTools) DeepCopyDevInfo(devInfo *common.TaskDevInfo) *common.TaskDevInfo
DeepCopyDevInfo copy device info deeply
func (*HotResetTools) GenerateTaskDevFaultInfoList ¶
func (hrt *HotResetTools) GenerateTaskDevFaultInfoList(devIdList []int32, rankIndex string) ([]*common.TaskDevInfo, error)
GenerateTaskDevFaultInfoList generate device fault info list in a task by device logic id list and rank index
func (*HotResetTools) GetAllTaskDevFaultInfoList ¶
func (hrt *HotResetTools) GetAllTaskDevFaultInfoList() map[string][]*common.TaskDevInfo
GetAllTaskDevFaultInfoList return all task device fault info list
func (*HotResetTools) GetCMFromCache ¶
func (hrt *HotResetTools) GetCMFromCache(cmKey string) (*v1.ConfigMap, error)
GetCMFromCache get configmap from indexer cache
func (*HotResetTools) GetDevIdList ¶
func (hrt *HotResetTools) GetDevIdList(devStr string) []int32
GetDevIdList convert device str to device logic id list
func (*HotResetTools) GetDevListByPolicyLevel ¶
func (hrt *HotResetTools) GetDevListByPolicyLevel(devFaultInfoList []*common.TaskDevInfo, policyLevel int) (map[int32]struct{}, error)
GetDevListByPolicyLevel return the dev list by policy level
func (*HotResetTools) GetDevListInReset ¶
func (hrt *HotResetTools) GetDevListInReset() map[int32]struct{}
GetDevListInReset return the logic id list of device in reset
func (*HotResetTools) GetDevProcessPolicy ¶
func (hrt *HotResetTools) GetDevProcessPolicy(faultType string) string
GetDevProcessPolicy return the policy of device with fault
func (*HotResetTools) GetFaultDev2PodMap ¶
func (hrt *HotResetTools) GetFaultDev2PodMap() (map[int32]v1.Pod, error)
GetFaultDev2PodMap return map which contains fault device and pod
func (*HotResetTools) GetGlobalDevFaultInfo ¶
func (hrt *HotResetTools) GetGlobalDevFaultInfo(logicID int32) (*common.DevFaultInfo, error)
GetGlobalDevFaultInfo return global device fault info from cache using input logic id
func (*HotResetTools) GetNeedResetDevMap ¶
func (hrt *HotResetTools) GetNeedResetDevMap(devFaultInfoList []*common.TaskDevInfo) (map[int32]int32, error)
GetNeedResetDevMap return device logic id list to be reset
func (*HotResetTools) GetResetDevNumOnce ¶
func (hrt *HotResetTools) GetResetDevNumOnce() (int, error)
GetResetDevNumOnce get reset device num at a time
func (*HotResetTools) GetTaskDevFaultInfoList ¶
func (hrt *HotResetTools) GetTaskDevFaultInfoList(taskName string) ([]*common.TaskDevInfo, error)
GetTaskDevFaultInfoList return task device fault info list
func (*HotResetTools) GetTaskFaultRankInfo ¶
func (hrt *HotResetTools) GetTaskFaultRankInfo(devFaultInfoList []*common.TaskDevInfo) (*common.TaskFaultInfo, error)
GetTaskFaultRankInfo return the fault rank info of task to update fault cm
func (*HotResetTools) GetTaskNameByPod ¶
func (hrt *HotResetTools) GetTaskNameByPod(pod v1.Pod) string
GetTaskNameByPod gets the task name, which is written by volcano or the operator
func (*HotResetTools) GetTaskPod ¶
func (hrt *HotResetTools) GetTaskPod(taskName string) (v1.Pod, error)
GetTaskPod return task pod
func (*HotResetTools) GetTaskProcessPolicy ¶
func (hrt *HotResetTools) GetTaskProcessPolicy(taskName string) (string, int, error)
GetTaskProcessPolicy return a task process policy
func (*HotResetTools) GetTaskResetInfo ¶
func (hrt *HotResetTools) GetTaskResetInfo(devFaultInfoList []*common.TaskDevInfo, policy, initPolicy, status string) (*common.TaskResetInfo, error)
GetTaskResetInfo return the detail reset info of task to process
func (*HotResetTools) IsCurNodeTaskInReset ¶
func (hrt *HotResetTools) IsCurNodeTaskInReset(taskName string) bool
IsCurNodeTaskInReset check whether the current task is being reset on the current node
func (*HotResetTools) IsExistFaultyDevInTask ¶
func (hrt *HotResetTools) IsExistFaultyDevInTask(taskName string) bool
IsExistFaultyDevInTask check if any fault device exist on current task
func (*HotResetTools) SetAllDevInReset ¶
func (hrt *HotResetTools) SetAllDevInReset(resetInfo *common.TaskResetInfo) error
SetAllDevInReset set all device in a task to the reset state
func (*HotResetTools) SetDevInReset ¶
func (hrt *HotResetTools) SetDevInReset(devId int32) error
SetDevInReset set a device to the reset state
func (*HotResetTools) SetTaskInReset ¶
func (hrt *HotResetTools) SetTaskInReset(taskName string) error
SetTaskInReset set a task to the reset state
func (*HotResetTools) SyncResetCM ¶
func (hrt *HotResetTools) SyncResetCM(ctx context.Context, client *kubeclient.ClientK8s)
SyncResetCM sync reset-cm event
func (*HotResetTools) UnSetAllDevInReset ¶
func (hrt *HotResetTools) UnSetAllDevInReset(resetInfo *common.TaskResetInfo) error
UnSetAllDevInReset unset all device in a task to leave the reset state
func (*HotResetTools) UnSetDevInReset ¶
func (hrt *HotResetTools) UnSetDevInReset(devId int32) error
UnSetDevInReset unset a device in a task to leave the reset state
func (*HotResetTools) UnSetTaskInReset ¶
func (hrt *HotResetTools) UnSetTaskInReset(taskName string) error
UnSetTaskInReset unset a task to leave the reset state
func (*HotResetTools) UpdateFaultDev2PodMap ¶
func (hrt *HotResetTools) UpdateFaultDev2PodMap(devList []int32, pod v1.Pod) error
UpdateFaultDev2PodMap updates the mapping between the unhealthy device and pod
func (*HotResetTools) UpdateFreeTask ¶
func (hrt *HotResetTools) UpdateFreeTask(taskListUsedDevice map[string]struct{}, newTaskDevList map[string][]int32)
UpdateFreeTask unsets the in-reset state of a task after the task is deleted
func (*HotResetTools) UpdateGlobalDevFaultInfoCache ¶
func (hrt *HotResetTools) UpdateGlobalDevFaultInfoCache(devDeviceList []*common.NpuDevice, isoDevList []int32) error
UpdateGlobalDevFaultInfoCache update global device fault info cache
func (*HotResetTools) UpdateTaskDevFaultInfoCache ¶
func (hrt *HotResetTools) UpdateTaskDevFaultInfoCache(taskDevFaultInfo map[string][]*common.TaskDevInfo) error
UpdateTaskDevFaultInfoCache update all task device fault info cache
func (*HotResetTools) UpdateTaskDevListCache ¶
func (hrt *HotResetTools) UpdateTaskDevListCache(taskDevList map[string][]int32) error
UpdateTaskDevListCache update all task device list cache
func (*HotResetTools) UpdateTaskPodCache ¶
func (hrt *HotResetTools) UpdateTaskPodCache(taskPod map[string]v1.Pod) error
UpdateTaskPodCache update all task pod cache
type HwAscend310Manager ¶
type HwAscend310Manager struct {
AscendTools
}
HwAscend310Manager manages huawei Ascend310 devices.
func NewHwAscend310Manager ¶
func NewHwAscend310Manager() *HwAscend310Manager
NewHwAscend310Manager used to create ascend 310 manager
func (*HwAscend310Manager) DoWithVolcanoListAndWatch ¶
func (hnm *HwAscend310Manager) DoWithVolcanoListAndWatch(classifyDevs map[string][]*common.NpuDevice)
DoWithVolcanoListAndWatch ascend310 watch device
func (*HwAscend310Manager) GetNPUs ¶
func (hnm *HwAscend310Manager) GetNPUs() (common.NpuAllInfo, error)
GetNPUs discovers all HUAWEI Ascend310 devices by calling the devmanager interface
func (*HwAscend310Manager) GraceTolerance ¶
GraceTolerance graceful fault tolerance, not supported currently
type HwAscend310PManager ¶
type HwAscend310PManager struct {
AscendTools
}
HwAscend310PManager manages huawei Ascend310P devices.
func NewHwAscend310PManager ¶
func NewHwAscend310PManager() *HwAscend310PManager
NewHwAscend310PManager used to create ascend 310P manager
func (*HwAscend310PManager) DoWithVolcanoListAndWatch ¶
func (hnm *HwAscend310PManager) DoWithVolcanoListAndWatch(classifyDevs map[string][]*common.NpuDevice)
DoWithVolcanoListAndWatch ascend310P affinity scheduling
func (*HwAscend310PManager) GetNPUs ¶
func (hnm *HwAscend310PManager) GetNPUs() (common.NpuAllInfo, error)
GetNPUs discovers all HUAWEI Ascend310P devices by calling the devmanager interface
func (*HwAscend310PManager) GraceTolerance ¶
GraceTolerance graceful fault tolerance, not supported currently
type HwAscend910Manager ¶
type HwAscend910Manager struct {
AscendTools
// contains filtered or unexported fields
}
HwAscend910Manager manages huawei Ascend910 devices.
func NewHwAscend910Manager ¶
func NewHwAscend910Manager() *HwAscend910Manager
NewHwAscend910Manager is used to create ascend 910 manager
func (*HwAscend910Manager) DoWithVolcanoListAndWatch ¶
func (hnm *HwAscend910Manager) DoWithVolcanoListAndWatch(classifyDevs map[string][]*common.NpuDevice)
DoWithVolcanoListAndWatch ascend910 affinity scheduling
func (*HwAscend910Manager) GetNPUs ¶
func (hnm *HwAscend910Manager) GetNPUs() (common.NpuAllInfo, error)
GetNPUs discovers all HUAWEI Ascend910 devices by calling the devmanager interface. A physical NPU can be split into multiple vNPUs; vNPUs are classified by computing power, like Ascend910-4c, Ascend910-8c, Ascend910-16c. The physical NPU sets correspond to the deviTypes, and the vNPU sets to the vDeviTypes. vDeviTypes may be: [Ascend910-4c, Ascend910-4c, Ascend910-8c], and deviTypes may be: [Ascend910, Ascend910]. Each class of deviType generates one socket file, like ascend910-4c.sock or Ascend910.sock, so we deduplicate.
func (*HwAscend910Manager) GraceTolerance ¶
func (hnm *HwAscend910Manager) GraceTolerance(ctx context.Context, classifyDevs map[string][]*common.NpuDevice)
GraceTolerance process training task with device fault gracefully
type ResetDevice ¶
type ResetDevice struct {
// CardId npu card id
CardId int32
// DeviceId npu device id
DeviceId int32
// AssociatedCardId card id of the associated npu
AssociatedCardId int32
// PhyID npu physical id
PhyID int32
// LogicID npu logic id
LogicID int32
}
ResetDevice is a device that failed to be reset
type ResetInfo ¶
type ResetInfo struct {
// ThirdPartyResetDevs devices waiting for a third party to reset
ThirdPartyResetDevs []ResetDevice
// ManualResetDevs devices waiting for manual reset
ManualResetDevs []ResetDevice
}
ResetInfo information of npu reset
type ResetInfoMgr ¶
type ResetInfoMgr struct {
// contains filtered or unexported fields
}
ResetInfoMgr mgr for npu reset
func GetResetInfoMgr ¶
func GetResetInfoMgr() *ResetInfoMgr
GetResetInfoMgr return the single instance of reset mgr, load reset info from node annotation
Source Files
¶
Directories
¶
| Path | Synopsis |
|---|---|
| deviceswitch | Package deviceswitch provides functions for getting switch fault codes |