@@ -17,17 +17,14 @@ function communicatingSubmitFcn(cluster, job, environmentProperties)
17
17
18
18
decodeFunction = ' parallel.cluster.generic.communicatingDecodeFcn' ;
19
19
20
- if cluster .HasSharedFilesystem
21
- error(' parallelexamples:GenericGridEngine:NotNonSharedFileSystem' , ...
22
- ' The function %s is for use with nonshared filesystems.' , currFilename )
23
- end
24
-
25
20
if ~strcmpi(cluster .OperatingSystem , ' unix' )
26
21
error(' parallelexamples:GenericGridEngine:UnsupportedOS' , ...
27
22
' The function %s only supports clusters with unix OS.' , currFilename )
28
23
end
29
24
30
- remoteConnection = getRemoteConnection(cluster );
25
+ if isprop(cluster .AdditionalProperties , ' ClusterHost' )
26
+ remoteConnection = getRemoteConnection(cluster );
27
+ end
31
28
32
29
% Determine the debug setting. Setting to true makes the MATLAB workers
33
30
% output additional logging. If EnableDebug is set in the cluster object's
@@ -65,11 +62,16 @@ function communicatingSubmitFcn(cluster, job, environmentProperties)
65
62
% The job specific environment variables
66
63
% Remove leading and trailing whitespace from the MATLAB arguments
67
64
matlabArguments = strtrim(environmentProperties .MatlabArguments );
68
- % Where on the remote filesystem to store job output
69
- storageLocation = remoteConnection .JobStorageLocation ;
70
- % If the RemoteJobStorageLocation ends with a space, add a slash to ensure it is respected
71
- if endsWith(storageLocation , ' ' )
72
- storageLocation = [storageLocation , fileSeparator ];
65
+
66
+ % Where the workers store job output
67
+ if cluster .HasSharedFilesystem
68
+ storageLocation = environmentProperties .StorageLocation ;
69
+ else
70
+ storageLocation = remoteConnection .JobStorageLocation ;
71
+ % If the RemoteJobStorageLocation ends with a space, add a slash to ensure it is respected
72
+ if endsWith(storageLocation , ' ' )
73
+ storageLocation = [storageLocation , fileSeparator ];
74
+ end
73
75
end
74
76
variables = {' PARALLEL_SERVER_DECODE_FUNCTION' , decodeFunction ; ...
75
77
' PARALLEL_SERVER_STORAGE_CONSTRUCTOR' , environmentProperties .StorageConstructor ; ...
@@ -93,33 +95,37 @@ function communicatingSubmitFcn(cluster, job, environmentProperties)
93
95
nonEmptyValues = cellfun(@(x ) ~isempty(strtrim(x )), variables(: ,2 ));
94
96
variables = variables(nonEmptyValues , : );
95
97
96
- % The local job directory
98
+ % The job directory as accessed by this machine
97
99
localJobDirectory = cluster .getJobFolder(job );
98
- % How we refer to the job directory on the cluster
99
- remoteJobDirectory = remoteConnection .getRemoteJobLocation(job .ID , cluster .OperatingSystem );
100
+
101
+ % The job directory as accessed by workers on the cluster
102
+ if cluster .HasSharedFilesystem
103
+ jobDirectoryOnCluster = cluster .getJobFolderOnCluster(job );
104
+ else
105
+ jobDirectoryOnCluster = remoteConnection .getRemoteJobLocation(job .ID , cluster .OperatingSystem );
106
+ end
100
107
% Specify the job wrapper script to use.
101
108
% Prior to R2019a, only the SMPD process manager is supported.
102
109
if verLessThan(' matlab' , ' 9.6' ) || ...
103
110
isprop(cluster .AdditionalProperties , ' UseSmpd' ) && cluster .AdditionalProperties .UseSmpd
104
- scriptName = ' communicatingJobWrapperSmpd.sh' ;
111
+ jobWrapperName = ' communicatingJobWrapperSmpd.sh' ;
105
112
parallelEnvironment = ' matlabSmpd' ;
106
113
else
107
- scriptName = ' communicatingJobWrapper.sh' ;
114
+ jobWrapperName = ' communicatingJobWrapper.sh' ;
108
115
parallelEnvironment = ' matlab' ;
109
116
end
110
117
% The wrapper script is in the same directory as this file
111
118
dirpart = fileparts(mfilename(' fullpath' ));
112
- localScript = fullfile(dirpart , scriptName );
119
+ localScript = fullfile(dirpart , jobWrapperName );
113
120
% Copy the local wrapper script to the job directory
114
121
copyfile(localScript , localJobDirectory );
115
122
116
- % The command that will be executed on the remote host to run the job.
117
- remoteScriptName = sprintf(' %s%s%s ' , remoteJobDirectory , fileSeparator , scriptName );
118
- quotedScriptName = sprintf(' %s%s%s ' , quote , remoteScriptName , quote );
123
+ % The script to execute on the cluster to run the job
124
+ wrapperPath = sprintf(' %s%s%s ' , jobDirectoryOnCluster , fileSeparator , jobWrapperName );
125
+ quotedWrapperPath = sprintf(' %s%s%s ' , quote , wrapperPath , quote );
119
126
120
- % Choose a file for the output. Please note that currently, JobStorageLocation refers
121
- % to a directory on disk, but this may change in the future.
122
- logFile = sprintf(' %s%s%s ' , remoteJobDirectory , fileSeparator , sprintf(' Job%d .log' , job .ID ));
127
+ % Choose a file for the output
128
+ logFile = sprintf(' %s%s%s ' , jobDirectoryOnCluster , fileSeparator , sprintf(' Job%d .log' , job .ID ));
123
129
quotedLogFile = sprintf(' %s%s%s ' , quote , logFile , quote );
124
130
dctSchedulerMessage(5 , ' %s : Using %s as log file' , currFilename , quotedLogFile );
125
131
@@ -131,43 +137,64 @@ function communicatingSubmitFcn(cluster, job, environmentProperties)
131
137
numSlots = environmentProperties .NumberOfTasks ;
132
138
additionalSubmitArgs = sprintf(' -pe %s %d ' , parallelEnvironment , numSlots );
133
139
dctSchedulerMessage(4 , ' %s : Requesting %d slots' , currFilename , numSlots );
140
+ if cluster .NumThreads > 1
141
+ additionalSubmitArgs = sprintf(' -binding pe linear:%d %s ' , cluster .NumThreads , additionalSubmitArgs );
142
+ end
134
143
commonSubmitArgs = getCommonSubmitArgs(cluster );
135
144
additionalSubmitArgs = strtrim(sprintf(' %s %s ' , additionalSubmitArgs , commonSubmitArgs ));
136
145
137
146
% Create a script to submit a Grid Engine job - this will be created in the job directory
138
147
dctSchedulerMessage(5 , ' %s : Generating script for job.' , currFilename );
139
- localScriptName = tempname(localJobDirectory );
140
- [~ , scriptName ] = fileparts(localScriptName );
141
- remoteScriptLocation = sprintf(' %s%s%s%s%s ' , quote , remoteJobDirectory , fileSeparator , scriptName , quote );
142
- createSubmitScript(localScriptName , jobName , quotedLogFile , quotedScriptName , ...
148
+ localSubmitScriptPath = tempname(localJobDirectory );
149
+ createSubmitScript(localSubmitScriptPath , jobName , quotedLogFile , quotedWrapperPath , ...
143
150
variables , additionalSubmitArgs );
144
- % Create the command to run on the remote host.
145
- commandToRun = sprintf(' sh %s ' , remoteScriptLocation );
146
151
147
- % Start the mirror to copy all the job files over to the cluster
148
- dctSchedulerMessage(4 , ' %s : Starting mirror for job %d .' , currFilename , job .ID );
149
- remoteConnection .startMirrorForJob(job );
152
+ % Path to the submit script as seen by the cluster
153
+ [~ , submitScriptName ] = fileparts(localSubmitScriptPath );
154
+ submitScriptPathOnCluster = sprintf(' %s%s%s ' , jobDirectoryOnCluster , fileSeparator , submitScriptName );
155
+ quotedSubmitScriptPathOnCluster = sprintf(' %s%s%s ' , quote , submitScriptPathOnCluster , quote );
156
+
157
+ % Create the command to run on the cluster
158
+ commandToRun = sprintf(' sh %s ' , quotedSubmitScriptPathOnCluster );
159
+
160
+ if ~cluster .HasSharedFilesystem
161
+ % Start the mirror to copy all the job files over to the cluster
162
+ dctSchedulerMessage(4 , ' %s : Starting mirror for job %d .' , currFilename , job .ID );
163
+ remoteConnection .startMirrorForJob(job );
164
+ end
150
165
151
- % Add execute permissions to shell scripts
152
- remoteConnection .runCommand(sprintf( ...
153
- ' chmod u+x %s%s *.sh' , remoteJobDirectory , fileSeparator ));
166
+ if isprop(cluster .AdditionalProperties , ' ClusterHost' )
167
+ % Add execute permissions to shell scripts
168
+ runSchedulerCommand(cluster , sprintf( ...
169
+ ' chmod u+x %s%s *.sh' , jobDirectoryOnCluster , fileSeparator ));
170
+ % Convert line endings to Unix
171
+ runSchedulerCommand(cluster , sprintf( ...
172
+ ' dos2unix %s%s *.sh' , jobDirectoryOnCluster , fileSeparator ));
173
+ end
154
174
155
175
% Now ask the cluster to run the submission command
156
176
dctSchedulerMessage(4 , ' %s : Submitting job using command:\n\t%s ' , currFilename , commandToRun );
157
- % Execute the command on the remote host.
158
- [cmdFailed , cmdOut ] = remoteConnection .runCommand(commandToRun );
177
+ try
178
+ [cmdFailed , cmdOut ] = runSchedulerCommand(cluster , commandToRun );
179
+ catch err
180
+ cmdFailed = true ;
181
+ cmdOut = err .message ;
182
+ end
159
183
if cmdFailed
160
- % Stop the mirroring if we failed to submit the job - this will also
161
- % remove the job files from the remote location
162
- % Only stop mirroring if we are actually mirroring
163
- if remoteConnection .isJobUsingConnection(job .ID )
164
- dctSchedulerMessage(5 , ' %s : Stopping the mirror for job %d .' , currFilename , job .ID );
165
- try
166
- remoteConnection .stopMirrorForJob(job );
167
- catch err
168
- warning(' parallelexamples:GenericGridEngine:FailedToStopMirrorForJob' , ...
169
- ' Failed to stop the file mirroring for job %d .\n Reason: %s ' , ...
170
- job .ID , err .getReport );
184
+ if ~cluster .HasSharedFilesystem
185
+ % Stop the mirroring if we failed to submit the job - this will also
186
+ % remove the job files from the remote location
187
+ remoteConnection = getRemoteConnection(cluster );
188
+ % Only stop mirroring if we are actually mirroring
189
+ if remoteConnection .isJobUsingConnection(job .ID )
190
+ dctSchedulerMessage(5 , ' %s : Stopping the mirror for job %d .' , currFilename , job .ID );
191
+ try
192
+ remoteConnection .stopMirrorForJob(job );
193
+ catch err
194
+ warning(' parallelexamples:GenericGridEngine:FailedToStopMirrorForJob' , ...
195
+ ' Failed to stop the file mirroring for job %d .\n Reason: %s ' , ...
196
+ job .ID , err .getReport );
197
+ end
171
198
end
172
199
end
173
200
error(' parallelexamples:GenericGridEngine:FailedToSubmitJob' , ...
@@ -188,11 +215,16 @@ function communicatingSubmitFcn(cluster, job, environmentProperties)
188
215
end
189
216
190
217
% Store the scheduler ID for each task and the job cluster data
191
- % Set the cluster host and remote job storage location on the job cluster data
192
- jobData = struct(' type' , ' generic' , ...
193
- ' RemoteHost' , remoteConnection .Hostname , ...
194
- ' RemoteJobStorageLocation' , remoteConnection .JobStorageLocation , ...
195
- ' HasDoneLastMirror' , false );
218
+ jobData = struct(' type' , ' generic' );
219
+ if isprop(cluster .AdditionalProperties , ' ClusterHost' )
220
+ % Store the cluster host
221
+ jobData.RemoteHost = remoteConnection .Hostname ;
222
+ end
223
+ if ~cluster .HasSharedFilesystem
224
+ % Store the remote job storage location
225
+ jobData.RemoteJobStorageLocation = remoteConnection .JobStorageLocation ;
226
+ jobData.HasDoneLastMirror = false ;
227
+ end
196
228
if verLessThan(' matlab' , ' 9.7' ) % schedulerID stored in job data
197
229
jobData.ClusterJobIDs = jobIDs ;
198
230
else % schedulerID on task since 19b
0 commit comments