Skip to content

Commit cecb6e1

Browse files
committed
Use MPI independent I/O to move data when header grows
* Because data is moved in units of 16 MB per process and there is no interleaved access among processes, independent I/O should perform faster.
* Use collective I/O only when the number of processes is > 1 and the MPI-IO hint "romio_no_indep_rw" is set to true.
1 parent 0dc7f9d commit cecb6e1

File tree

1 file changed

+16
-29
lines changed

1 file changed

+16
-29
lines changed

src/drivers/ncmpio/ncmpio_enddef.c

Lines changed: 16 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,14 @@
3737
#define MOVE_UNIT 16777216
3838

3939
/*----< move_file_block() >-------------------------------------------------*/
40-
/* Call MPI independent I/O subroutines to move data */
40+
/* Call MPI I/O subroutines to move data */
4141
static int
4242
move_file_block(NC *ncp,
4343
MPI_Offset to, /* destination starting file offset */
4444
MPI_Offset from, /* source starting file offset */
4545
MPI_Offset nbytes) /* amount to be moved */
4646
{
47-
int rank, nprocs, mpireturn, err, status=NC_NOERR;
47+
int rank, nprocs, mpireturn, err, status=NC_NOERR, do_coll;
4848
void *buf;
4949
size_t num_moves, mv_amnt, p_units;
5050
MPI_Offset off_last, off_from, off_to;
@@ -66,6 +66,13 @@ move_file_block(NC *ncp,
6666
TRACE_IO(MPI_File_set_view)(fh, 0, MPI_BYTE, MPI_BYTE, "native",
6767
MPI_INFO_NULL);
6868

69+
/* Use MPI collective I/O subroutines to move data, only if nprocs > 1 and
70+
* MPI-IO hint "romio_no_indep_rw" is set to true. Otherwise, use MPI
71+
* independent I/O subroutines, as the data partitioned among processes are
72+
* not interleaved and thus need no collective I/O.
73+
*/
74+
do_coll = (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL));
75+
6976
/* buf will be used as a temporary buffer to move data in chunks, i.e.
7077
* read a chunk and later write to the new location
7178
*/
@@ -103,17 +110,15 @@ move_file_block(NC *ncp,
103110
chunk_size = 0;
104111
}
105112

106-
/* each rank moves data of size chunk_size from off_from to off_to */
107-
108113
/* explicitly initialize mpistatus object to 0. For zero-length read,
109114
* MPI_Get_count may report incorrect result for some MPICH version,
110115
* due to the uninitialized MPI_Status object passed to MPI-IO calls.
111116
* Thus we initialize it below to work around this.
112117
*/
113118
memset(&mpistatus, 0, sizeof(MPI_Status));
114119

115-
/* read the original data at off_from for amount of chunk_size */
116-
if (ncp->nprocs > 1)
120+
/* read from file at off_from for amount of chunk_size */
121+
if (do_coll)
117122
TRACE_IO(MPI_File_read_at_all)(fh, off_from, buf, chunk_size,
118123
MPI_BYTE, &mpistatus);
119124
else
@@ -132,11 +137,6 @@ move_file_block(NC *ncp,
132137
* work around. See MPICH ticket:
133138
* https://trac.mpich.org/projects/mpich/ticket/2332
134139
*
135-
* Note we cannot set chunk_size to get_size, as the actual size
136-
* read from a file may be less than chunk_size. Because we are
137-
* moving whatever read to a new file offset, we must use the
138-
* amount actually read to call MPI_File_write_at_all below.
139-
*
140140
* Update the number of bytes read since file open.
141141
* Because each rank reads and writes no more than one chunk_size
142142
* at a time and chunk_size is < NC_MAX_INT, it is OK to call
@@ -149,31 +149,18 @@ move_file_block(NC *ncp,
149149
/* to prevent one rank's write from running ahead of another rank's read */
150150
if (ncp->nprocs > 1) MPI_Barrier(ncp->comm);
151151

152-
/* write to new location at off_to for amount of chunk_size
153-
*
154-
* Ideally, we should write the amount of get_size returned from a call
155-
* to MPI_Get_count in the below MPI write. This is in case some
156-
* variables are defined but never been written. The value returned by
157-
* MPI_Get_count is supposed to be the actual amount read by the MPI
158-
* read call. If partial data (or none) is available for read, then we
159-
* should just write that amount. Note this MPI write is collective,
160-
* and thus all processes must participate the call even if get_size
161-
* is 0. However, in some MPICH versions MPI_Get_count fails to report
162-
* the correct value due to an internal error that fails to initialize
163-
* the MPI_Status object. Therefore, the solution can be either to
164-
* explicitly initialize the status object to zeros, or to just use
165-
* chunk_size for write. Note that the latter will write the variables
166-
* that have not been written before. Below uses the former option.
167-
*/
168-
169152
/* explicitly initialize mpistatus object to 0. For zero-length read,
170153
* MPI_Get_count may report incorrect result for some MPICH version,
171154
* due to the uninitialized MPI_Status object passed to MPI-IO calls.
172155
* Thus we initialize it below to work around this.
173156
*/
174157
memset(&mpistatus, 0, sizeof(MPI_Status));
175158

176-
if (ncp->nprocs > 1)
159+
/* Write to the new location at off_to for the amount of get_size. This
160+
* assumes the call to MPI_Get_count() above returned the actual amount of
161+
* data read from the file, i.e. get_size.
162+
*/
163+
if (do_coll)
177164
TRACE_IO(MPI_File_write_at_all)(fh, off_to, buf,
178165
get_size /* NOT chunk_size */,
179166
MPI_BYTE, &mpistatus);

0 commit comments

Comments
 (0)