Skip to content

Commit 70bc46e

Browse files
committed
Add pid and per-pid GPU memory usage to get_gpu_info_c
1 parent 88a4bf7 commit 70bc46e

File tree

3 files changed

+111
-25
lines changed

3 files changed

+111
-25
lines changed

src/gpu/utils.cu

+46-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,16 @@ int get_compute_capability(int d_idx, int *major, int *minor, int *ratioperf) {
5252
}
5353

5454

55-
void get_gpu_info_c(unsigned int *n_gpus, int *gpu_percent_usage, unsigned long long *gpu_total_memory, unsigned long long *gpu_free_memory, char **gpu_name, int *majors, int *minors) {
55+
void get_gpu_info_c(unsigned int *n_gpus, int *gpu_percent_usage, unsigned long long *gpu_total_memory,
56+
unsigned long long *gpu_free_memory,
57+
char **gpu_name,
58+
int *majors, int *minors,
59+
unsigned int *num_pids, unsigned int *pids, unsigned long long *usedGpuMemorys) {
60+
61+
bool verbose=false;
62+
if(verbose){
63+
std::cerr << "inside get_gpu_info_c c function" << std::endl;
64+
}
5665

5766
nvmlReturn_t rv;
5867
rv = nvmlInit();
@@ -69,17 +78,25 @@ void get_gpu_info_c(unsigned int *n_gpus, int *gpu_percent_usage, unsigned long
6978
nvmlReturn_t rv;
7079
rv = nvmlDeviceGetHandleByIndex(i, &device);
7180
assert(rv == NVML_SUCCESS);
81+
7282
nvmlUtilization_t utilization;
7383
rv = nvmlDeviceGetUtilizationRates(device, &utilization);
7484
assert(rv == NVML_SUCCESS);
7585
gpu_percent_usage[i] = utilization.gpu;
86+
87+
if(verbose){
88+
std::cerr << "i=" << i << " usage=" << gpu_percent_usage[i] << std::endl;
89+
}
90+
7691
nvmlMemory_t memory;
7792
rv = nvmlDeviceGetMemoryInfo(device, &memory);
7893
assert(rv == NVML_SUCCESS);
7994
gpu_total_memory[i] = memory.total;
8095
gpu_free_memory[i] = memory.free;
96+
8197
rv = nvmlDeviceGetName(device, gpu_name[i], 30);
8298
assert(rv == NVML_SUCCESS);
99+
83100
#if (CUDART_VERSION >= 9000)
84101
rv = nvmlDeviceGetCudaComputeCapability(device, &majors[i], &minors[i]);
85102
assert(rv == NVML_SUCCESS);
@@ -91,6 +108,34 @@ void get_gpu_info_c(unsigned int *n_gpus, int *gpu_percent_usage, unsigned long
91108
majors[i] = -1;
92109
minors[i] = -1;
93110
#endif
111+
112+
if(verbose){
113+
std::cerr << "i=" << i << " majors=" << majors[i] << " minors=" << minors[i] << std::endl;
114+
}
115+
116+
unsigned int max_pids=2000;
117+
unsigned int infoCount;
118+
//rv = nvmlDeviceGetComputeRunningProcesses(device, &infoCount, NULL);
119+
//assert(rv == NVML_SUCCESS);
120+
infoCount = max_pids;
121+
nvmlProcessInfo_t infos[infoCount];
122+
unsigned int num_pid_local;
123+
num_pids[i] = infoCount;
124+
rv = nvmlDeviceGetComputeRunningProcesses(device, &num_pids[i], infos);
125+
assert(rv == NVML_SUCCESS);
126+
if(num_pids[i] > max_pids){
127+
std::cerr << "Too many pids: " << num_pids[i] << " . Increase max_pids: " << max_pids << std::endl;
128+
assert(num_pids[i] <= max_pids);
129+
}
130+
for (unsigned int pidi=0; pidi < num_pids[i]; pidi++) {
131+
pids[pidi + i * max_pids] = infos[pidi].pid;
132+
usedGpuMemorys[pidi + i * max_pids] = infos[pidi].usedGpuMemory;
133+
134+
if(verbose){
135+
std::cerr << "i=" << i << " pidi=" << pidi << " pids=" << pids[pidi + i * max_pids] << " gpumemory=" << usedGpuMemorys[pidi + i * max_pids] << std::endl;
136+
}
137+
138+
}
94139
}
95140

96141
}

src/interface_py/h2o4gpu/util/gpu.py

+49-20
Original file line numberDiff line numberDiff line change
@@ -148,33 +148,42 @@ def get_gpu_info_subprocess(return_usage=False):
148148
return (total_gpus, total_mem, gpu_type)
149149

150150

151-
def get_gpu_info_c(return_usage=False, return_capability=False,
152-
return_all=False, verbose=False):
151+
def get_gpu_info_c(return_usage=False,
152+
return_free_memory=False,
153+
return_capability=False,
154+
return_memory_by_pid=False,
155+
return_all=False,
156+
verbose=False):
153157
"""Gets the GPU info from C call
154158
155159
:return:
156160
Total number of GPUs and total available memory
157161
(and optionally GPU usage)
158162
"""
159-
max_gpus = 128
163+
max_gpus = 16
160164
total_gpus = 0
161165
total_gpus_actual = 0
162166
which_gpus = []
163-
usages_tmp = np.empty(max_gpus, dtype=np.int32)
164-
total_mems_tmp = np.empty(max_gpus, dtype=np.uint64)
165-
free_mems_tmp = np.empty(max_gpus, dtype=np.uint64)
167+
usages_tmp = np.zeros(max_gpus, dtype=np.int32)
168+
total_mems_tmp = np.zeros(max_gpus, dtype=np.uint64)
169+
free_mems_tmp = np.zeros(max_gpus, dtype=np.uint64)
166170
# This 30 should be same as the gpu type in get_gpu_info_c
167171
gpu_types_tmp = [' ' * 30 for _ in range(64)]
168-
majors_tmp = np.empty(max_gpus, dtype=np.int32)
169-
minors_tmp = np.empty(max_gpus, dtype=np.int32)
172+
majors_tmp = np.zeros(max_gpus, dtype=np.int32)
173+
minors_tmp = np.zeros(max_gpus, dtype=np.int32)
174+
max_pids = 2000
175+
num_pids_tmp = np.zeros(max_pids, dtype=np.uint32)
176+
pids_tmp = np.zeros(max_pids * max_gpus, dtype=np.uint32)
177+
usedGpuMemorys_tmp = np.zeros(max_pids * max_gpus, dtype=np.uint64)
170178

171179
try:
172180
from ..libs.lib_utils import GPUlib
173181
lib = GPUlib().get()
174182

175183
total_gpus_actual = \
176184
lib.get_gpu_info_c(usages_tmp, total_mems_tmp, free_mems_tmp,
177-
gpu_types_tmp, majors_tmp, minors_tmp)
185+
gpu_types_tmp, majors_tmp, minors_tmp,
186+
num_pids_tmp, pids_tmp, usedGpuMemorys_tmp)
178187

179188
# This will drop the GPU count, but the returned usage
180189
total_gpus, which_gpus = cuda_vis_check(total_gpus_actual)
@@ -185,7 +194,10 @@ def get_gpu_info_c(return_usage=False, return_capability=False,
185194
# pylint: disable=broad-except
186195
except Exception as e:
187196
if verbose:
197+
import sys
198+
sys.stderr.write("Exception: %s" % str(e))
188199
print(e)
200+
sys.stdout.flush()
189201

190202
if return_capability or return_all:
191203
if list(minors_tmp)[0] == -1:
@@ -198,32 +210,49 @@ def get_gpu_info_c(return_usage=False, return_capability=False,
198210
usages_actual = np.resize(usages_tmp, total_gpus_actual)
199211
majors_actual = np.resize(majors_tmp, total_gpus_actual)
200212
minors_actual = np.resize(minors_tmp, total_gpus_actual)
213+
num_pids_actual = np.resize(num_pids_tmp, total_gpus_actual)
214+
pids_actual = np.resize(pids_tmp, total_gpus_actual * max_pids)
215+
usedGpuMemorys_actual = np.resize(usedGpuMemorys_tmp,
216+
total_gpus_actual * max_pids)
201217

202218
total_mems = np.resize(np.copy(total_mems_actual), total_gpus)
203219
free_mems = np.resize(np.copy(free_mems_actual), total_gpus)
204220
gpu_types = np.resize(np.copy(gpu_types_actual), total_gpus)
205221
usages = np.resize(np.copy(usages_actual), total_gpus)
206222
majors = np.resize(np.copy(majors_actual), total_gpus)
207223
minors = np.resize(np.copy(minors_actual), total_gpus)
224+
num_pids = np.resize(np.copy(num_pids_actual), total_gpus)
225+
pids = np.resize(np.copy(pids_actual), total_gpus * max_pids)
226+
usedGpuMemorys = np.resize(np.copy(usedGpuMemorys_actual),
227+
total_gpus * max_pids)
228+
208229
gpu_i = 0
209230
for j in range(total_gpus_actual):
210231
if j in which_gpus:
211232
total_mems[gpu_i] = total_mems_actual[j]
212233
free_mems[gpu_i] = free_mems_actual[j]
213234
gpu_types[gpu_i] = gpu_types_actual[j]
214235
usages[gpu_i] = usages_actual[j]
236+
minors[gpu_i] = minors_actual[j]
237+
majors[gpu_i] = majors_actual[j]
238+
num_pids[gpu_i] = num_pids_actual[j]
239+
pids[gpu_i] = pids_actual[j]
240+
usedGpuMemorys[gpu_i] = usedGpuMemorys_actual[j]
215241
gpu_i += 1
216-
217-
if return_all:
218-
return (total_gpus, total_mems,
219-
free_mems, gpu_types, usages, majors, minors)
220-
if return_usage and return_capability:
221-
return (total_gpus, total_mems, gpu_types, usages, majors, minors)
222-
if return_usage:
223-
return (total_gpus, total_mems, gpu_types, usages)
224-
if return_capability:
225-
return (total_gpus, total_mems, gpu_types, majors, minors)
226-
return (total_gpus, total_mems, gpu_types)
242+
pids = np.reshape(pids, (total_gpus, max_pids))
243+
usedGpuMemorys = np.reshape(usedGpuMemorys, (total_gpus, max_pids))
244+
245+
to_return = [total_gpus, total_mems, gpu_types]
246+
if return_all or return_usage:
247+
to_return.append(usages)
248+
if return_all or return_free_memory:
249+
to_return.append(free_mems)
250+
if return_all or return_capability:
251+
to_return.extend([majors, minors])
252+
if return_all or return_memory_by_pid:
253+
to_return.extend([num_pids, pids, usedGpuMemorys])
254+
255+
return tuple(to_return)
227256

228257

229258
def cudaresetdevice(gpu_id, n_gpus):

src/swig/util/gpu.i

+16-4
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
%{
33
extern int cudaresetdevice(int wDev, int nDev);
44
extern int get_compute_capability(int d_idx, int *major, int *minor, int *ratioperf);
5-
extern void get_gpu_info_c(unsigned int *n_gpus, int *gpu_percent_usage, unsigned long long *gpu_total_memory, unsigned long long *gpu_free_memory, char **gpu_name, int *majors, int *minors);
5+
extern void get_gpu_info_c(unsigned int *n_gpus, int *gpu_percent_usage,
6+
unsigned long long *gpu_total_memory, unsigned long long *gpu_free_memory,
7+
char **gpu_name,
8+
int *majors, int *minors,
9+
unsigned int *num_pids, unsigned int *pids, unsigned long long *usedGpuMemorys);
610
extern int cudaresetdevice_bare(void);
711
%}
812

@@ -38,15 +42,23 @@ extern int cudaresetdevice_bare(void);
3842
free((char *) $1);
3943
}
4044

41-
%apply int *OUTPUT {int *major, int *minor, int *ratioperf}
42-
%apply int *OUTPUT {unsigned int *n_gpus}
45+
%apply int *OUTPUT {int *major, int *minor, int *ratioperf};
46+
47+
%apply int *OUTPUT {unsigned int *n_gpus};
4348
%apply (int *INPLACE_ARRAY1) {int *gpu_percent_usage};
4449
%apply (int *INPLACE_ARRAY1) {int *majors};
4550
%apply (int *INPLACE_ARRAY1) {int *minors};
4651
%apply (unsigned long long *INPLACE_ARRAY1) {unsigned long long *gpu_total_memory};
4752
%apply (unsigned long long *INPLACE_ARRAY1) {unsigned long long *gpu_free_memory};
53+
%apply (unsigned int *INPLACE_ARRAY1) {unsigned int *num_pids};
54+
%apply (unsigned int *INPLACE_ARRAY1) {unsigned int *pids};
55+
%apply (unsigned long long *INPLACE_ARRAY1) {unsigned long long *usedGpuMemorys};
4856

4957
extern int cudaresetdevice(int wDev, int nDev);
5058
extern int get_compute_capability(int d_idx, int *major, int *minor, int *ratioperf);
51-
extern void get_gpu_info_c(unsigned int *n_gpus, int *gpu_percent_usage, unsigned long long *gpu_total_memory, unsigned long long *gpu_free_memory, char **gpu_name, int *majors, int *minors);
59+
extern void get_gpu_info_c(unsigned int *n_gpus, int *gpu_percent_usage,
60+
unsigned long long *gpu_total_memory, unsigned long long *gpu_free_memory,
61+
char **gpu_name,
62+
int *majors, int *minors,
63+
unsigned int *num_pids, unsigned int *pids, unsigned long long *usedGpuMemorys);
5264
extern int cudaresetdevice_bare(void);

0 commit comments

Comments
 (0)