3737#define MOVE_UNIT 16777216
3838
3939/*----< move_file_block() >-------------------------------------------------*/
40- /* Call MPI independent I/O subroutines to move data */
40+ /* Call MPI I/O subroutines to move data */
4141static int
4242move_file_block (NC * ncp ,
4343 MPI_Offset to , /* destination starting file offset */
4444 MPI_Offset from , /* source starting file offset */
4545 MPI_Offset nbytes ) /* amount to be moved */
4646{
47- int rank , nprocs , mpireturn , err , status = NC_NOERR ;
47+ int rank , nprocs , mpireturn , err , status = NC_NOERR , do_coll ;
4848 void * buf ;
4949 size_t num_moves , mv_amnt , p_units ;
5050 MPI_Offset off_last , off_from , off_to ;
@@ -66,6 +66,13 @@ move_file_block(NC *ncp,
6666 TRACE_IO (MPI_File_set_view )(fh , 0 , MPI_BYTE , MPI_BYTE , "native" ,
6767 MPI_INFO_NULL );
6868
69+ /* Use MPI collective I/O subroutines to move data, only if nproc > 1 and
70+ * MPI-IO hint "romio_no_indep_rw" is set to true. Otherwise, use MPI
71+ * independent I/O subroutines, as the data partitioned among processes are
72+ * not interleaved and thus need no collective I/O.
73+ */
74+ do_coll = (ncp -> nprocs > 1 && fIsSet (ncp -> flags , NC_HCOLL ));
75+
6976 /* buf will be used as a temporary buffer to move data in chunks, i.e.
7077 * read a chunk and later write to the new location
7178 */
@@ -103,17 +110,15 @@ move_file_block(NC *ncp,
103110 chunk_size = 0 ;
104111 }
105112
106- /* each rank moves data of size chunk_size from off_from to off_to */
107-
108113 /* Explicitly initialize the mpistatus object to 0. For a zero-length
109114 * read, MPI_Get_count may report an incorrect result in some MPICH
110115 * versions, due to the uninitialized MPI_Status object passed to MPI-IO
111116 * calls. Thus we initialize it here to work around the problem.
112117 */
113118 memset (& mpistatus , 0 , sizeof (MPI_Status ));
114119
115- /* read the original data at off_from for amount of chunk_size */
116- if (ncp -> nprocs > 1 )
120+ /* read from file at off_from for amount of chunk_size */
121+ if (do_coll )
117122 TRACE_IO (MPI_File_read_at_all )(fh , off_from , buf , chunk_size ,
118123 MPI_BYTE , & mpistatus );
119124 else
@@ -132,11 +137,6 @@ move_file_block(NC *ncp,
132137 * work around. See MPICH ticket:
133138 * https://trac.mpich.org/projects/mpich/ticket/2332
134139 *
135- * Note we cannot set chunk_size to get_size, as the actual size
136- * read from a file may be less than chunk_size. Because we are
137- * moving whatever read to a new file offset, we must use the
138- * amount actually read to call MPI_File_write_at_all below.
139- *
140140 * Update the number of bytes read since file open.
141141 * Because each rank reads and writes no more than one chunk_size
142142 * at a time and chunk_size is < NC_MAX_INT, it is OK to call
@@ -149,31 +149,18 @@ move_file_block(NC *ncp,
149149 /* prevent one rank's write from running ahead of another rank's read */
150150 if (ncp -> nprocs > 1 ) MPI_Barrier (ncp -> comm );
151151
152- /* write to new location at off_to for amount of chunk_size
153- *
154- * Ideally, we should write the amount of get_size returned from a call
155- * to MPI_Get_count in the below MPI write. This is in case some
156- * variables are defined but never been written. The value returned by
157- * MPI_Get_count is supposed to be the actual amount read by the MPI
158- * read call. If partial data (or none) is available for read, then we
159- * should just write that amount. Note this MPI write is collective,
160- * and thus all processes must participate the call even if get_size
161- * is 0. However, in some MPICH versions MPI_Get_count fails to report
162- * the correct value due to an internal error that fails to initialize
163- * the MPI_Status object. Therefore, the solution can be either to
164- * explicitly initialize the status object to zeros, or to just use
165- * chunk_size for write. Note that the latter will write the variables
166- * that have not been written before. Below uses the former option.
167- */
168-
169152 /* Explicitly initialize the mpistatus object to 0. For a zero-length
170153 * read, MPI_Get_count may report an incorrect result in some MPICH
171154 * versions, due to the uninitialized MPI_Status object passed to MPI-IO
172155 * calls. Thus we initialize it here to work around the problem.
173156 */
174157 memset (& mpistatus , 0 , sizeof (MPI_Status ));
175158
176- if (ncp -> nprocs > 1 )
159+ /* Write to the new location at off_to for the amount of get_size. This
160+ * assumes the call to MPI_Get_count() above returned the actual amount
161+ * of data read from the file, i.e. get_size.
162+ */
163+ if (do_coll )
177164 TRACE_IO (MPI_File_write_at_all )(fh , off_to , buf ,
178165 get_size /* NOT chunk_size */ ,
179166 MPI_BYTE , & mpistatus );
0 commit comments