|
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * |
| 2 | + * |
| 3 | + * Copyright (C) 2024, Northwestern University |
| 4 | + * |
| 5 | + * This program tests collective write and read calls using a noncontiguous |
| 6 | + * user buffer datatype, which consists of 2 blocks separated by a gap. The |
| 7 | + * block size and gap size can be adjusted through command-line options. |
| 8 | + * |
| 9 | + * This program tests the performance impact of the many memcpy() calls made in |
| 10 | + * ROMIO's subroutine ADIOI_LUSTRE_Fill_send_buffer() when the user buffer is |
| 11 | + * noncontiguous. |
| 12 | + * |
| 13 | + * The performance issue is discovered when running a PIO test program using |
| 14 | + * Lustre. When read/write requests are large and the Lustre striping size is |
| 15 | + * small, then the number of calls to memcpy() can become large, hurting the |
| 16 | + * performance. |
| 17 | + * |
| 18 | + * The original settings of the PIO test program are as follows: |
| 19 | + * The number of MPI process clients = 2048 |
| 20 | + * The number of I/O tasks (aggregators) = 16 |
| 21 | + * The number of variables = 64 |
| 22 | + * One extra small variable is written before 64 variables. |
| 23 | + * Each variable is a 2D array of size 58 x 1048576 |
| 24 | + * Data partitioning is done along the 2nd dimension |
| 25 | + * Writes to all 64 subarrays are aggregated into one MPI_File_write call |
| 26 | + * User buffer consists of two separately allocated memory spaces. |
| 27 | + * |
| 28 | + * To compile: |
| 29 | + * % mpicc -O2 pio_noncontig.c -o pio_noncontig |
| 30 | + * |
| 31 | + * Example output of running 16 processes on a local Linux machine using UFS: |
| 32 | + * Note the 2 runs below differ only on whether option "-g 0" is used. Option |
| 33 | + * "-g 0" does not add a gap in the user buffer, making the buffer contiguous. |
| 34 | + * |
| 35 | + * % mpiexec -n 16 pio_noncontig -k 256 -c 32768 -w |
| 36 | + * Number of global variables = 64 |
| 37 | + * Each global variable is of size 256 x 32768 bytes |
| 38 | + * Each local variable is of size 256 x 16 bytes |
| 39 | + * Gap between the first 2 variables is of size 16 bytes |
| 40 | + * Number of subarray types concatenated is 8192 |
| 41 | + * Each process makes a request of amount 33554688 bytes |
| 42 | + * ROMIO hint set: cb_buffer_size = 1048576 |
| 43 | + * ROMIO hint set: cb_nodes = 4 |
| 44 | + * --------------------------------------------------------- |
| 45 | + * Time of collective write = 33.07 sec |
| 46 | + * --------------------------------------------------------- |
| 47 | + * |
| 48 | + * % mpiexec -n 16 pio_noncontig -k 256 -c 32768 -w -g 0 |
| 49 | + * Number of global variables = 64 |
| 50 | + * Each global variable is of size 256 x 32768 bytes |
| 51 | + * Each local variable is of size 256 x 16 bytes |
| 52 | + * Gap between the first 2 variables is of size 0 bytes |
| 53 | + * Number of subarray types concatenated is 8192 |
| 54 | + * Each process makes a request of amount 33554688 bytes |
| 55 | + * ROMIO hint set: cb_buffer_size = 1048576 |
| 56 | + * ROMIO hint set: cb_nodes = 4 |
| 57 | + * --------------------------------------------------------- |
| 58 | + * Time of collective write = 8.27 sec |
| 59 | + * --------------------------------------------------------- |
| 60 | + * |
| 61 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ |
| 62 | + |
| 63 | +#include <stdio.h> |
| 64 | +#include <stdlib.h> |
| 65 | +#include <unistd.h> /* getopt() */ |
| 66 | + |
| 67 | +#include <mpi.h> |
| 68 | + |
/* Default problem dimensions; each can be overridden on the command line. */
#define NVARS    64       /* default number of global variables (-n) */
#define NROWS    58       /* default number of rows in each variable (-k) */
#define NCOLS    1048576  /* default number of columns in each variable (-c) */
#define NAGGR    16       /* number of I/O aggregators in the original run */
#define NCLIENTS 2048     /* number of MPI process clients in the original run */
#define GAP      16       /* default gap size in the user buffer, mimicking
                           * two separate malloc() regions (-g) */

/* Values passed to MPI_Info_set() as the ROMIO hints of the same names. */
#define cb_buffer_size "1048576"
#define cb_nodes       "4"

/*
 * Check the MPI return code held in the caller's variable `err`. On failure,
 * print the MPI error string together with the source line number, increment
 * the caller's `nerrs` counter, and jump to the caller's `err_out` label.
 * The macro expands to a complete if-statement, so it is invoked without a
 * trailing semicolon.
 */
#define ERR \
    if (err != MPI_SUCCESS) { \
        char err_msg_[MPI_MAX_ERROR_STRING]; \
        int  err_msg_len_; \
        MPI_Error_string(err, err_msg_, &err_msg_len_); \
        printf("Error at line %d: %s\n", __LINE__, err_msg_); \
        nerrs++; \
        goto err_out; \
    }
| 88 | + |
| 89 | +static void |
| 90 | +usage(char *argv0) |
| 91 | +{ |
| 92 | + char *help = |
| 93 | + "Usage: %s [-hvrw | -n num | -k num | -c num | -g num | file_name]\n" |
| 94 | + " [-h] Print this help\n" |
| 95 | + " [-v] verbose mode\n" |
| 96 | + " [-w] performs write only (default: both write and read)\n" |
| 97 | + " [-r] performs read only (default: both write and read)\n" |
| 98 | + " [-n num] number of global variables (default: %d)\n" |
| 99 | + " [-k num] number of rows in each global variable (default: %d)\n" |
| 100 | + " [-c num] number of columns in each global variable (default: %d)\n" |
| 101 | + " [-g num] gap in bytes between first 2 blocks (default: %d)\n" |
| 102 | + " [file_name] output file name\n"; |
| 103 | + fprintf(stderr, help, argv0, NVARS, NROWS, NCOLS, GAP); |
| 104 | +} |
| 105 | + |
| 106 | +/*----< main() >------------------------------------------------------------*/ |
| 107 | +int main(int argc, char **argv) |
| 108 | +{ |
| 109 | + extern int optind; |
| 110 | + extern char *optarg; |
| 111 | + char filename[256]; |
| 112 | + int i, err, nerrs=0, rank, nprocs, mode, verbose=0, nvars, nreqs; |
| 113 | + int gap, ncols_g, nrows, ncols, *blocklen, btype_size, ftype_size; |
| 114 | + int do_write, do_read; |
| 115 | + char *buf; |
| 116 | + double timing[2], max_timing[2]; |
| 117 | + MPI_Aint lb, *displace, buf_ext, file_ext; |
| 118 | + MPI_Datatype bufType, fileType, *subTypes; |
| 119 | + MPI_File fh; |
| 120 | + MPI_Offset wlen; |
| 121 | + MPI_Status status; |
| 122 | + MPI_Info info = MPI_INFO_NULL; |
| 123 | + |
| 124 | + MPI_Init(&argc,&argv); |
| 125 | + MPI_Comm_rank(MPI_COMM_WORLD, &rank); |
| 126 | + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); |
| 127 | + |
| 128 | + nvars = NVARS; |
| 129 | + nrows = NROWS; |
| 130 | + ncols_g = NCOLS; |
| 131 | + gap = GAP; |
| 132 | + do_write = 1; |
| 133 | + do_read = 1; |
| 134 | + |
| 135 | + /* get command-line arguments */ |
| 136 | + while ((i = getopt(argc, argv, "hvwrn:k:c:g:")) != EOF) |
| 137 | + switch(i) { |
| 138 | + case 'v': verbose = 1; |
| 139 | + break; |
| 140 | + case 'n': nvars = atoi(optarg); |
| 141 | + break; |
| 142 | + case 'k': nrows = atoi(optarg); |
| 143 | + if (nrows < 0) { |
| 144 | + if (rank == 0) |
| 145 | + printf("Error: number of rows must >= 0\n"); |
| 146 | + MPI_Finalize(); |
| 147 | + return 1; |
| 148 | + } |
| 149 | + break; |
| 150 | + case 'c': ncols_g = atoi(optarg); |
| 151 | + if (ncols_g < 2048) { |
| 152 | + if (rank == 0) |
| 153 | + printf("Error: number of columns must >= %d\n", |
| 154 | + NCLIENTS); |
| 155 | + MPI_Finalize(); |
| 156 | + return 1; |
| 157 | + } |
| 158 | + break; |
| 159 | + case 'g': gap = atoi(optarg); |
| 160 | + break; |
| 161 | + case 'w': do_read = 0; |
| 162 | + break; |
| 163 | + case 'r': do_write = 0; |
| 164 | + break; |
| 165 | + case 'h': |
| 166 | + default: if (rank==0) usage(argv[0]); |
| 167 | + MPI_Finalize(); |
| 168 | + return 1; |
| 169 | + } |
| 170 | + if (argv[optind] == NULL) |
| 171 | + sprintf(filename, "%s.out", argv[0]); |
| 172 | + else |
| 173 | + snprintf(filename, 256, "%s", argv[optind]); |
| 174 | + |
| 175 | + /* Calculate number of subarray requests each aggregator writes or reads. |
| 176 | + * Each original MPI process client forwards all its requests to one of |
| 177 | + * the I/O tasks. To run the original case, run 16 MPI processes. |
| 178 | + */ |
| 179 | + nreqs = nvars * NCLIENTS / nprocs; |
| 180 | + nreqs++; /* one small variable at the beginning */ |
| 181 | + |
| 182 | + /* Data partitioning is done along 2nd dimension */ |
| 183 | + ncols = ncols_g / NCLIENTS; |
| 184 | + |
| 185 | + wlen = (MPI_Offset)nrows * ncols * (nreqs - 1) + nrows; |
| 186 | + if (rank == 0) { |
| 187 | + printf("Number of global variables = %d\n", nvars); |
| 188 | + printf("Each global variable is of size %d x %d bytes\n",nrows,ncols_g); |
| 189 | + printf("Each local variable is of size %d x %d bytes\n",nrows,ncols); |
| 190 | + printf("Gap between the first 2 variables is of size %d bytes\n", gap); |
| 191 | + printf("Number of subarray types concatenated is %d\n", nreqs-1); |
| 192 | + printf("Each process makes a request of amount %lld bytes\n", wlen); |
| 193 | + printf("ROMIO hint set: cb_buffer_size = %s\n", cb_buffer_size); |
| 194 | + printf("ROMIO hint set: cb_nodes = %s\n", cb_nodes); |
| 195 | + } |
| 196 | + /* check 4-byte integer overflow */ |
| 197 | + if (wlen > 2147483647) { |
| 198 | + if (rank == 0) { |
| 199 | + printf("Error: local write size %lld > INT_MAX.\n", wlen); |
| 200 | + printf(" Try increasing number of processes\n"); |
| 201 | + printf(" or reduce the block size.\n"); |
| 202 | + printf(" nrows=%d ncols=%d\n", nrows,ncols); |
| 203 | + } |
| 204 | + MPI_Abort(MPI_COMM_WORLD, 1); |
| 205 | + exit(1); |
| 206 | + } |
| 207 | + |
| 208 | + blocklen = (int*) malloc(sizeof(int) * nreqs); |
| 209 | + displace = (MPI_Aint*) malloc(sizeof(MPI_Aint) * nreqs); |
| 210 | + |
| 211 | + /* User buffer consists of two noncontiguous spaces. To mimic this, we |
| 212 | + * allocate one space but add a gap in between |
| 213 | + */ |
| 214 | + blocklen[0] = nrows; /* a small request of size nrows */ |
| 215 | + blocklen[1] = nrows * ncols * (nreqs - 1); |
| 216 | + |
| 217 | + displace[0] = 0; |
| 218 | + displace[1] = nrows + gap; |
| 219 | + |
| 220 | + /* construct buffer datatype */ |
| 221 | + err = MPI_Type_create_hindexed(2, blocklen, displace, MPI_BYTE, &bufType); |
| 222 | + ERR |
| 223 | + err = MPI_Type_commit(&bufType); ERR |
| 224 | + |
| 225 | + /* allocate I/O buffer */ |
| 226 | + err = MPI_Type_size(bufType, &btype_size); ERR |
| 227 | + err = MPI_Type_get_extent(bufType, &lb, &buf_ext); ERR |
| 228 | + buf = (char*) calloc(buf_ext, 1); |
| 229 | + |
| 230 | + /* construct file type: |
| 231 | + * + there are nreqs subarrays, each uses a subarray datatype |
| 232 | + * + all subarray datatypes are concatenated into one to be used as fileview |
| 233 | + */ |
| 234 | + subTypes = (MPI_Datatype*) malloc(sizeof(MPI_Datatype) * nreqs); |
| 235 | + |
| 236 | + /* first is the small variable at the beginning of the file */ |
| 237 | + err = MPI_Type_contiguous(nrows, MPI_BYTE, &subTypes[0]); ERR |
| 238 | + blocklen[0] = 1; |
| 239 | + displace[0] = nrows * rank; |
| 240 | + |
| 241 | + for (i=1; i<nreqs; i++) { |
| 242 | + int gsizes[2], subsizes[2], starts[2]; |
| 243 | + |
| 244 | + gsizes[0] = nrows; |
| 245 | + gsizes[1] = ncols * nprocs; |
| 246 | + subsizes[0] = nrows; |
| 247 | + subsizes[1] = ncols; |
| 248 | + starts[0] = 0; |
| 249 | + starts[1] = ncols * rank; |
| 250 | + err = MPI_Type_create_subarray(2, gsizes, subsizes, starts, |
| 251 | + MPI_ORDER_C, MPI_BYTE, &subTypes[i]); ERR |
| 252 | + blocklen[i] = 1; |
| 253 | + displace[i] = (MPI_Aint)nrows * nprocs |
| 254 | + + (MPI_Aint)gsizes[0] * gsizes[1] * (i - 1); |
| 255 | + } |
| 256 | + |
| 257 | + /* concatenate all subTypes into one datatype */ |
| 258 | + err = MPI_Type_create_struct(nreqs, blocklen, displace, subTypes, |
| 259 | + &fileType); ERR |
| 260 | + err = MPI_Type_commit(&fileType); ERR |
| 261 | + |
| 262 | + for (i=0; i<nreqs; i++) { |
| 263 | + err = MPI_Type_free(&subTypes[i]); ERR |
| 264 | + } |
| 265 | + free(subTypes); |
| 266 | + free(displace); |
| 267 | + free(blocklen); |
| 268 | + |
| 269 | + /* check datatype extent and size */ |
| 270 | + err = MPI_Type_get_extent(fileType, &lb, &file_ext); ERR |
| 271 | + err = MPI_Type_size(fileType, &ftype_size); ERR |
| 272 | + |
| 273 | + if (ftype_size != btype_size) { |
| 274 | + if (rank == 0) |
| 275 | + printf("Error: sizes of fileType and bufType mismatch (%d != %d)\n", |
| 276 | + ftype_size, btype_size); |
| 277 | + MPI_Abort(MPI_COMM_WORLD, 1); |
| 278 | + exit(1); |
| 279 | + } |
| 280 | + if (verbose) |
| 281 | + printf("%2d: buf_ext=%ld btype_size=%d file_ext=%ld ftype_size=%d\n", |
| 282 | + rank,buf_ext,btype_size,file_ext,ftype_size); |
| 283 | + |
| 284 | + /* set hints to mimic Lustre striping size of 1MB and count of 4 on a UFS */ |
| 285 | + MPI_Info_create(&info); |
| 286 | + MPI_Info_set(info, "cb_config_list", "*:*"); |
| 287 | + MPI_Info_set(info, "cb_buffer_size", cb_buffer_size); |
| 288 | + MPI_Info_set(info, "cb_nodes", cb_nodes); |
| 289 | + |
| 290 | + mode = MPI_MODE_CREATE | MPI_MODE_RDWR; |
| 291 | + err = MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &fh); ERR |
| 292 | + |
| 293 | + err = MPI_File_set_view(fh, 0, MPI_BYTE, fileType, "native", MPI_INFO_NULL); |
| 294 | + ERR |
| 295 | + |
| 296 | + MPI_Info_free(&info); |
| 297 | + |
| 298 | + /* write to the file */ |
| 299 | + if (do_write) { |
| 300 | + MPI_Barrier(MPI_COMM_WORLD); |
| 301 | + timing[0] = MPI_Wtime(); |
| 302 | + err = MPI_File_write_at_all(fh, 0, buf, 1, bufType, &status); ERR |
| 303 | + timing[0] = MPI_Wtime() - timing[0]; |
| 304 | + } |
| 305 | + |
| 306 | + /* read from the file */ |
| 307 | + if (do_read) { |
| 308 | + MPI_Barrier(MPI_COMM_WORLD); |
| 309 | + timing[1] = MPI_Wtime(); |
| 310 | + err = MPI_File_read_at_all(fh, 0, buf, 1, bufType, &status); ERR |
| 311 | + timing[1] = MPI_Wtime() - timing[1]; |
| 312 | + } |
| 313 | + |
| 314 | + err = MPI_File_close(&fh); ERR |
| 315 | + |
| 316 | + err = MPI_Type_free(&fileType); ERR |
| 317 | + err = MPI_Type_free(&bufType); ERR |
| 318 | + free(buf); |
| 319 | + |
| 320 | + MPI_Reduce(timing, max_timing, 2, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); |
| 321 | + if (rank == 0) { |
| 322 | + printf("---------------------------------------------------------\n"); |
| 323 | + if (do_write) |
| 324 | + printf("Time of collective write = %.2f sec\n", max_timing[0]); |
| 325 | + if (do_read) |
| 326 | + printf("Time of collective read = %.2f sec\n", max_timing[1]); |
| 327 | + printf("---------------------------------------------------------\n"); |
| 328 | + } |
| 329 | + |
| 330 | +err_out: |
| 331 | + MPI_Finalize(); |
| 332 | + return (nerrs > 0); |
| 333 | +} |
| 334 | + |
0 commit comments