File size: 3,119 Bytes
6755a2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from typing import Union, List, Dict, Tuple, Literal

import logging


def convert_byte_unit(
    value: float,
    src_unit: Literal["b", "B", "KB", "MB", "GB", "TB"],
    target_unit: Literal["b", "B", "KB", "MB", "GB", "TB"],
) -> float:
    """Convert a data size from one unit to another.

    The conversion happens in two steps: ``value`` is first expressed in
    bytes, then that byte count is rescaled to ``target_unit``. Multiples
    are binary: 1 KB = 1024 B, 1 MB = 1024**2 B, 1 GB = 1024**3 B and
    1 TB = 1024**4 B.

    Args:
        value: Amount to convert, expressed in ``src_unit``.
        src_unit: Unit of ``value``. ``"b"`` or ``"bit"`` denote bits,
            ``"B"`` or ``"Byte"`` denote bytes, and ``"KB"``, ``"MB"``,
            ``"GB"``, ``"TB"`` denote powers of 1024 bytes.
        target_unit: Unit of the result; accepts the same spellings as
            ``src_unit``.

    Raises:
        ValueError: If ``src_unit`` is not one of the accepted units.
        ValueError: If ``target_unit`` is not one of the accepted units.

    Returns:
        ``value`` expressed in ``target_unit``. When no division is
        involved (e.g. an ``int`` converted from ``"KB"`` to ``"B"``) the
        result keeps the numeric type of the input.
    """
    bit_names = ("b", "bit")
    byte_names = ("B", "Byte")
    # A unit's position in this tuple, plus one, is its power of 1024.
    multiples = ("KB", "MB", "GB", "TB")

    # Step 1: normalise the input to a byte count.
    if src_unit in bit_names:
        byte_count = value / 8
    elif src_unit in byte_names:
        byte_count = value
    elif src_unit in multiples:
        byte_count = value * 1024 ** (multiples.index(src_unit) + 1)
    else:
        raise ValueError("src_unit is not valid")

    # Step 2: rescale the byte count to the requested unit.
    if target_unit in bit_names:
        return byte_count * 8
    if target_unit in byte_names:
        return byte_count
    if target_unit in multiples:
        return byte_count / 1024 ** (multiples.index(target_unit) + 1)
    raise ValueError("target_unit is not valid")


def get_gpu_status(unit: str = "MB") -> List[Dict]:
    """Collect memory and utilization figures for every GPU visible to NVML.

    ``pynvml`` is imported lazily inside the function, so a missing
    ``pynvml`` raises ``ImportError`` from this call rather than when the
    module is loaded.

    Args:
        unit: Unit used for the ``*_memory`` fields; passed to
            ``convert_byte_unit`` as ``target_unit`` (defaults to ``"MB"``).

    Returns:
        One dict per GPU, in device-index order, with the keys
        ``gpu_name``, ``total_memory``, ``used_memory``,
        ``used_memory_ratio``, ``gpu_utilization``, ``free_memory_ratio``
        and ``free_memory``. The ratio fields are fractions of total
        memory; ``gpu_utilization`` is the ``gpu`` field of NVML's
        utilization rates, passed through unchanged. If NVML fails at any
        point the error is logged and the entries gathered so far
        (possibly none) are returned instead of raising.
    """
    import pynvml

    infos = []
    # Tracks whether nvmlInit() succeeded, so that shutdown is attempted
    # only for a library that was actually initialised.
    nvml_ready = False
    try:
        # Initialise NVML.
        pynvml.nvmlInit()
        nvml_ready = True

        # Query each GPU in turn.
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
            gpu_name = pynvml.nvmlDeviceGetName(handle)
            infos.append(
                {
                    "gpu_name": gpu_name,
                    "total_memory": convert_byte_unit(
                        info.total, src_unit="B", target_unit=unit
                    ),
                    "used_memory": convert_byte_unit(
                        info.used, src_unit="B", target_unit=unit
                    ),
                    "used_memory_ratio": info.used / info.total,
                    "gpu_utilization": utilization.gpu,
                    "free_memory_ratio": info.free / info.total,
                    "free_memory": convert_byte_unit(
                        info.free, src_unit="B", target_unit=unit
                    ),
                }
            )
    except Exception as e:
        # Best-effort API: report the failure and fall through to return
        # whatever was collected.
        print("get_gpu_status failed")
        logging.exception(e)
    finally:
        # Release NVML even when a query above failed; previously an error
        # inside the loop skipped the shutdown call entirely.
        if nvml_ready:
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                print("get_gpu_status failed")
                logging.exception(e)
    return infos