Skip to content

Commit 7426408

Browse files
committed
Add a test program, pio_noncontig.c
See detailed description at the beginning of the file.
1 parent d670f5f commit 7426408

File tree

1 file changed

+334
-0
lines changed

1 file changed

+334
-0
lines changed

tests/pio_noncontig.c

Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2+
*
3+
* Copyright (C) 2024, Northwestern University
4+
*
5+
* This program tests collective write and read calls using a noncontiguous
6+
* user buffer datatype, which consists of 2 blocks separated by a gap. The
7+
* block size and gap size can be adjusted through command-line options.
8+
*
9+
* This program tests the performance impact of the many memcpy() calls made in
10+
* ROMIO's subroutine ADIOI_LUSTRE_Fill_send_buffer() when the user buffer is
11+
* non contiguous.
12+
*
13+
* The performance issue was discovered when running a PIO test program using
14+
* Lustre. When read/write requests are large and the Lustre striping size is
15+
* small, then the number of calls to memcpy() can become large, hurting the
16+
* performance.
17+
*
18+
* The original settings of the PIO test program are as follows:
19+
* The number of MPI process clients = 2048
20+
* The number of I/O tasks (aggregators) = 16
21+
* The number of variables = 64
22+
* One extra small variable is written before 64 variables.
23+
* Each variable is a 2D array of size 58 x 10485762
24+
* Data partitioning is done along the 2nd dimension
25+
* Writes to all 64 subarrays are aggregated into one MPI_File_write call
26+
* User buffer consists of two separately allocated memory spaces.
27+
*
28+
* To compile:
29+
* % mpicc -O2 pio_noncontig.c -o pio_noncontig
30+
*
31+
* Example output of running 16 processes on a local Linux machine using UFS:
32+
* Note the 2 runs below differ only on whether option "-g 0" is used. Option
33+
* "-g 0" does not add a gap in the user buffer, making the buffer contiguous.
34+
*
35+
* % mpiexec -n 16 pio_noncontig -k 256 -c 32768 -w
36+
* Number of global variables = 64
37+
* Each global variable is of size 256 x 32768 bytes
38+
* Each local variable is of size 256 x 16 bytes
39+
* Gap between the first 2 variables is of size 16 bytes
40+
* Number of subarray types concatenated is 8192
41+
* Each process makes a request of amount 33554688 bytes
42+
* ROMIO hint set: cb_buffer_size = 1048576
43+
* ROMIO hint set: cb_nodes = 4
44+
* ---------------------------------------------------------
45+
* Time of collective write = 33.07 sec
46+
* ---------------------------------------------------------
47+
*
48+
* % mpiexec -n 16 pio_noncontig -k 256 -c 32768 -w -g 0
49+
* Number of global variables = 64
50+
* Each global variable is of size 256 x 32768 bytes
51+
* Each local variable is of size 256 x 16 bytes
52+
* Gap between the first 2 variables is of size 0 bytes
53+
* Number of subarray types concatenated is 8192
54+
* Each process makes a request of amount 33554688 bytes
55+
* ROMIO hint set: cb_buffer_size = 1048576
56+
* ROMIO hint set: cb_nodes = 4
57+
* ---------------------------------------------------------
58+
* Time of collective write = 8.27 sec
59+
* ---------------------------------------------------------
60+
*
61+
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
62+
63+
#include <stdio.h>
64+
#include <stdlib.h>
65+
#include <unistd.h> /* getopt() */
66+
67+
#include <mpi.h>
68+
69+
#define NVARS 64 /* Number of variables */
70+
#define NROWS 58 /* Number of rows in each variable */
71+
#define NCOLS 1048576 /* Number of columns in each variable */
72+
#define NAGGR 16 /* Number of I/O aggregators */
73+
#define NCLIENTS 2048 /* Number of MPI process clients */
74+
#define GAP 16 /* gap size in the user buffer, mimic 2 malloc() */
75+
76+
#define cb_buffer_size "1048576"
77+
#define cb_nodes "4"
78+
79+
/* Check the MPI return code held in the caller's `err` variable. On failure,
 * print the MPI error text with the current source line, increment the
 * caller's `nerrs` counter and jump to the caller's `err_out` label.
 * The macro is a bare `if` block, so it is used without a trailing semicolon.
 */
#define ERR \
    if (err != MPI_SUCCESS) { \
        char mpi_err_msg[MPI_MAX_ERROR_STRING]; \
        int mpi_err_len; \
        MPI_Error_string(err, mpi_err_msg, &mpi_err_len); \
        printf("Error at line %d: %s\n",__LINE__,mpi_err_msg); \
        nerrs++; \
        goto err_out; \
    }
88+
89+
static void
90+
usage(char *argv0)
91+
{
92+
char *help =
93+
"Usage: %s [-hvrw | -n num | -k num | -c num | -g num | file_name]\n"
94+
" [-h] Print this help\n"
95+
" [-v] verbose mode\n"
96+
" [-w] performs write only (default: both write and read)\n"
97+
" [-r] performs read only (default: both write and read)\n"
98+
" [-n num] number of global variables (default: %d)\n"
99+
" [-k num] number of rows in each global variable (default: %d)\n"
100+
" [-c num] number of columns in each global variable (default: %d)\n"
101+
" [-g num] gap in bytes between first 2 blocks (default: %d)\n"
102+
" [file_name] output file name\n";
103+
fprintf(stderr, help, argv0, NVARS, NROWS, NCOLS, GAP);
104+
}
105+
106+
/*----< main() >------------------------------------------------------------*/
107+
int main(int argc, char **argv)
108+
{
109+
extern int optind;
110+
extern char *optarg;
111+
char filename[256];
112+
int i, err, nerrs=0, rank, nprocs, mode, verbose=0, nvars, nreqs;
113+
int gap, ncols_g, nrows, ncols, *blocklen, btype_size, ftype_size;
114+
int do_write, do_read;
115+
char *buf;
116+
double timing[2], max_timing[2];
117+
MPI_Aint lb, *displace, buf_ext, file_ext;
118+
MPI_Datatype bufType, fileType, *subTypes;
119+
MPI_File fh;
120+
MPI_Offset wlen;
121+
MPI_Status status;
122+
MPI_Info info = MPI_INFO_NULL;
123+
124+
MPI_Init(&argc,&argv);
125+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
126+
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
127+
128+
nvars = NVARS;
129+
nrows = NROWS;
130+
ncols_g = NCOLS;
131+
gap = GAP;
132+
do_write = 1;
133+
do_read = 1;
134+
135+
/* get command-line arguments */
136+
while ((i = getopt(argc, argv, "hvwrn:k:c:g:")) != EOF)
137+
switch(i) {
138+
case 'v': verbose = 1;
139+
break;
140+
case 'n': nvars = atoi(optarg);
141+
break;
142+
case 'k': nrows = atoi(optarg);
143+
if (nrows < 0) {
144+
if (rank == 0)
145+
printf("Error: number of rows must >= 0\n");
146+
MPI_Finalize();
147+
return 1;
148+
}
149+
break;
150+
case 'c': ncols_g = atoi(optarg);
151+
if (ncols_g < 2048) {
152+
if (rank == 0)
153+
printf("Error: number of columns must >= %d\n",
154+
NCLIENTS);
155+
MPI_Finalize();
156+
return 1;
157+
}
158+
break;
159+
case 'g': gap = atoi(optarg);
160+
break;
161+
case 'w': do_read = 0;
162+
break;
163+
case 'r': do_write = 0;
164+
break;
165+
case 'h':
166+
default: if (rank==0) usage(argv[0]);
167+
MPI_Finalize();
168+
return 1;
169+
}
170+
if (argv[optind] == NULL)
171+
sprintf(filename, "%s.out", argv[0]);
172+
else
173+
snprintf(filename, 256, "%s", argv[optind]);
174+
175+
/* Calculate number of subarray requests each aggregator writes or reads.
176+
* Each original MPI process client forwards all its requests to one of
177+
* the I/O tasks. To run the original case, run 16 MPI processes.
178+
*/
179+
nreqs = nvars * NCLIENTS / nprocs;
180+
nreqs++; /* one small variable at the beginning */
181+
182+
/* Data partitioning is done along 2nd dimension */
183+
ncols = ncols_g / NCLIENTS;
184+
185+
wlen = (MPI_Offset)nrows * ncols * (nreqs - 1) + nrows;
186+
if (rank == 0) {
187+
printf("Number of global variables = %d\n", nvars);
188+
printf("Each global variable is of size %d x %d bytes\n",nrows,ncols_g);
189+
printf("Each local variable is of size %d x %d bytes\n",nrows,ncols);
190+
printf("Gap between the first 2 variables is of size %d bytes\n", gap);
191+
printf("Number of subarray types concatenated is %d\n", nreqs-1);
192+
printf("Each process makes a request of amount %lld bytes\n", wlen);
193+
printf("ROMIO hint set: cb_buffer_size = %s\n", cb_buffer_size);
194+
printf("ROMIO hint set: cb_nodes = %s\n", cb_nodes);
195+
}
196+
/* check 4-byte integer overflow */
197+
if (wlen > 2147483647) {
198+
if (rank == 0) {
199+
printf("Error: local write size %lld > INT_MAX.\n", wlen);
200+
printf(" Try increasing number of processes\n");
201+
printf(" or reduce the block size.\n");
202+
printf(" nrows=%d ncols=%d\n", nrows,ncols);
203+
}
204+
MPI_Abort(MPI_COMM_WORLD, 1);
205+
exit(1);
206+
}
207+
208+
blocklen = (int*) malloc(sizeof(int) * nreqs);
209+
displace = (MPI_Aint*) malloc(sizeof(MPI_Aint) * nreqs);
210+
211+
/* User buffer consists of two noncontiguous spaces. To mimic this, we
212+
* allocate one space but add a gap in between
213+
*/
214+
blocklen[0] = nrows; /* a small request of size nrows */
215+
blocklen[1] = nrows * ncols * (nreqs - 1);
216+
217+
displace[0] = 0;
218+
displace[1] = nrows + gap;
219+
220+
/* construct buffer datatype */
221+
err = MPI_Type_create_hindexed(2, blocklen, displace, MPI_BYTE, &bufType);
222+
ERR
223+
err = MPI_Type_commit(&bufType); ERR
224+
225+
/* allocate I/O buffer */
226+
err = MPI_Type_size(bufType, &btype_size); ERR
227+
err = MPI_Type_get_extent(bufType, &lb, &buf_ext); ERR
228+
buf = (char*) calloc(buf_ext, 1);
229+
230+
/* construct file type:
231+
* + there are nreqs subarrays, each uses a subarray datatype
232+
* + all subarray datatypes are concatenated into one to be used as fileview
233+
*/
234+
subTypes = (MPI_Datatype*) malloc(sizeof(MPI_Datatype) * nreqs);
235+
236+
/* first is the small variable at the beginning of the file */
237+
err = MPI_Type_contiguous(nrows, MPI_BYTE, &subTypes[0]); ERR
238+
blocklen[0] = 1;
239+
displace[0] = nrows * rank;
240+
241+
for (i=1; i<nreqs; i++) {
242+
int gsizes[2], subsizes[2], starts[2];
243+
244+
gsizes[0] = nrows;
245+
gsizes[1] = ncols * nprocs;
246+
subsizes[0] = nrows;
247+
subsizes[1] = ncols;
248+
starts[0] = 0;
249+
starts[1] = ncols * rank;
250+
err = MPI_Type_create_subarray(2, gsizes, subsizes, starts,
251+
MPI_ORDER_C, MPI_BYTE, &subTypes[i]); ERR
252+
blocklen[i] = 1;
253+
displace[i] = (MPI_Aint)nrows * nprocs
254+
+ (MPI_Aint)gsizes[0] * gsizes[1] * (i - 1);
255+
}
256+
257+
/* concatenate all subTypes into one datatype */
258+
err = MPI_Type_create_struct(nreqs, blocklen, displace, subTypes,
259+
&fileType); ERR
260+
err = MPI_Type_commit(&fileType); ERR
261+
262+
for (i=0; i<nreqs; i++) {
263+
err = MPI_Type_free(&subTypes[i]); ERR
264+
}
265+
free(subTypes);
266+
free(displace);
267+
free(blocklen);
268+
269+
/* check datatype extent and size */
270+
err = MPI_Type_get_extent(fileType, &lb, &file_ext); ERR
271+
err = MPI_Type_size(fileType, &ftype_size); ERR
272+
273+
if (ftype_size != btype_size) {
274+
if (rank == 0)
275+
printf("Error: sizes of fileType and bufType mismatch (%d != %d)\n",
276+
ftype_size, btype_size);
277+
MPI_Abort(MPI_COMM_WORLD, 1);
278+
exit(1);
279+
}
280+
if (verbose)
281+
printf("%2d: buf_ext=%ld btype_size=%d file_ext=%ld ftype_size=%d\n",
282+
rank,buf_ext,btype_size,file_ext,ftype_size);
283+
284+
/* set hints to mimic Lustre striping size of 1MB and count of 4 on a UFS */
285+
MPI_Info_create(&info);
286+
MPI_Info_set(info, "cb_config_list", "*:*");
287+
MPI_Info_set(info, "cb_buffer_size", cb_buffer_size);
288+
MPI_Info_set(info, "cb_nodes", cb_nodes);
289+
290+
mode = MPI_MODE_CREATE | MPI_MODE_RDWR;
291+
err = MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &fh); ERR
292+
293+
err = MPI_File_set_view(fh, 0, MPI_BYTE, fileType, "native", MPI_INFO_NULL);
294+
ERR
295+
296+
MPI_Info_free(&info);
297+
298+
/* write to the file */
299+
if (do_write) {
300+
MPI_Barrier(MPI_COMM_WORLD);
301+
timing[0] = MPI_Wtime();
302+
err = MPI_File_write_at_all(fh, 0, buf, 1, bufType, &status); ERR
303+
timing[0] = MPI_Wtime() - timing[0];
304+
}
305+
306+
/* read from the file */
307+
if (do_read) {
308+
MPI_Barrier(MPI_COMM_WORLD);
309+
timing[1] = MPI_Wtime();
310+
err = MPI_File_read_at_all(fh, 0, buf, 1, bufType, &status); ERR
311+
timing[1] = MPI_Wtime() - timing[1];
312+
}
313+
314+
err = MPI_File_close(&fh); ERR
315+
316+
err = MPI_Type_free(&fileType); ERR
317+
err = MPI_Type_free(&bufType); ERR
318+
free(buf);
319+
320+
MPI_Reduce(timing, max_timing, 2, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
321+
if (rank == 0) {
322+
printf("---------------------------------------------------------\n");
323+
if (do_write)
324+
printf("Time of collective write = %.2f sec\n", max_timing[0]);
325+
if (do_read)
326+
printf("Time of collective read = %.2f sec\n", max_timing[1]);
327+
printf("---------------------------------------------------------\n");
328+
}
329+
330+
err_out:
331+
MPI_Finalize();
332+
return (nerrs > 0);
333+
}
334+

0 commit comments

Comments
 (0)