Skip to content

Commit abe757c

Browse files
committed
v2.2.0
1 parent ddfd0aa commit abe757c

18 files changed

+269
-214
lines changed

cancelJobFcn.m

Lines changed: 2 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -4,75 +4,8 @@
44
% Set your cluster's PluginScriptsLocation to the parent folder of this
55
% function to run it when you cancel a job.
66

7-
% Copyright 2010-2022 The MathWorks, Inc.
7+
% Copyright 2010-2023 The MathWorks, Inc.
88

9-
% Store the current filename for the errors, warnings and
10-
% dctSchedulerMessages
11-
currFilename = mfilename;
12-
if ~isa(cluster, 'parallel.Cluster')
13-
error('parallelexamples:GenericGridEngine:SubmitFcnError', ...
14-
'The function %s is for use with clusters created using the parcluster command.', currFilename)
15-
end
16-
17-
% Get the information about the actual cluster used
18-
data = cluster.getJobClusterData(job);
19-
if isempty(data)
20-
% This indicates that the job has not been submitted, so return true
21-
dctSchedulerMessage(1, '%s: Job cluster data was empty for job with ID %d.', currFilename, job.ID);
22-
OK = true;
23-
return
24-
end
25-
26-
% Get a simplified list of schedulerIDs to reduce the number of calls to
27-
% the scheduler.
28-
schedulerIDs = getSimplifiedSchedulerIDsForJob(job);
29-
erroredJobAndCauseStrings = cell(size(schedulerIDs));
30-
% Get the cluster to delete the job
31-
for ii = 1:length(schedulerIDs)
32-
schedulerID = schedulerIDs{ii};
33-
commandToRun = sprintf('qdel "%s"', schedulerID);
34-
dctSchedulerMessage(4, '%s: Canceling job on cluster using command:\n\t%s.', currFilename, commandToRun);
35-
try
36-
[cmdFailed, cmdOut] = runSchedulerCommand(cluster, commandToRun);
37-
catch err
38-
cmdFailed = true;
39-
cmdOut = err.message;
40-
end
41-
% If a job is already in a terminal state, qdel will return a failed
42-
% error code and cmdOut will be of the form:
43-
% 'denied: job "2979" does not exist'
44-
% If this happens we do not consider the command to have failed.
45-
if cmdFailed && ~contains(cmdOut, 'does not exist')
46-
% Keep track of all jobs that errored when being cancelled, either
47-
% through a bad exit code or if an error was thrown. We'll report
48-
% these later on.
49-
erroredJobAndCauseStrings{ii} = sprintf('Job ID: %s\tReason: %s', schedulerID, strtrim(cmdOut));
50-
dctSchedulerMessage(1, '%s: Failed to cancel job %s on cluster. Reason:\n\t%s', currFilename, schedulerID, cmdOut);
51-
end
52-
end
53-
54-
if ~cluster.HasSharedFilesystem
55-
% Only stop mirroring if we are actually mirroring
56-
remoteConnection = getRemoteConnection(cluster);
57-
if remoteConnection.isJobUsingConnection(job.ID)
58-
dctSchedulerMessage(5, '%s: Stopping the mirror for job %d.', currFilename, job.ID);
59-
try
60-
remoteConnection.stopMirrorForJob(job);
61-
catch err
62-
warning('parallelexamples:GenericGridEngine:FailedToStopMirrorForJob', ...
63-
'Failed to stop the file mirroring for job %d.\nReason: %s', ...
64-
job.ID, err.getReport);
65-
end
66-
end
67-
end
68-
69-
% Now warn about those jobs that we failed to cancel.
70-
erroredJobAndCauseStrings = erroredJobAndCauseStrings(~cellfun(@isempty, erroredJobAndCauseStrings));
71-
if ~isempty(erroredJobAndCauseStrings)
72-
warning('parallelexamples:GenericGridEngine:FailedToCancelJob', ...
73-
'Failed to cancel the following jobs on the cluster:\n%s', ...
74-
sprintf(' %s\n', erroredJobAndCauseStrings{:}));
75-
end
76-
OK = isempty(erroredJobAndCauseStrings);
9+
OK = cancelJobOnCluster(cluster, job);
7710

7811
end

cancelTaskFcn.m

Lines changed: 2 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -4,66 +4,8 @@
44
% Set your cluster's PluginScriptsLocation to the parent folder of this
55
% function to run it when you cancel a task.
66

7-
% Copyright 2020-2022 The MathWorks, Inc.
7+
% Copyright 2020-2023 The MathWorks, Inc.
88

9-
% Store the current filename for the errors, warnings and
10-
% dctSchedulerMessages
11-
currFilename = mfilename;
12-
if ~isa(cluster, 'parallel.Cluster')
13-
error('parallelexamples:GenericGridEngine:SubmitFcnError', ...
14-
'The function %s is for use with clusters created using the parcluster command.', currFilename)
15-
end
16-
17-
% Get the information about the actual cluster used
18-
data = cluster.getJobClusterData(task.Parent);
19-
if isempty(data)
20-
% This indicates that the parent job has not been submitted, so return true
21-
dctSchedulerMessage(1, '%s: Job cluster data was empty for the parent job with ID %d.', currFilename, task.Parent.ID);
22-
OK = true;
23-
return
24-
end
25-
% We can't cancel a single task of a communicating job on the scheduler
26-
% without cancelling the entire job, so warn and return in this case
27-
if ~strcmpi(task.Parent.Type, 'independent')
28-
OK = false;
29-
warning('parallelexamples:GenericGridEngine:FailedToCancelTask', ...
30-
'Unable to cancel a single task of a communicating job. If you want to cancel the entire job, use the cancel function on the job object instead.');
31-
return
32-
end
33-
34-
% Get the cluster to delete the task
35-
if verLessThan('matlab', '9.7') % schedulerID stored in job data
36-
schedulerIDs = data.ClusterJobIDs;
37-
schedulerID = schedulerIDs{task.ID};
38-
else % schedulerID on task since 19b
39-
schedulerID = task.SchedulerID;
40-
end
41-
erroredTaskAndCauseString = '';
42-
commandToRun = sprintf('qdel "%s"', schedulerID);
43-
dctSchedulerMessage(4, '%s: Canceling task on cluster using command:\n\t%s.', currFilename, commandToRun);
44-
try
45-
[cmdFailed, cmdOut] = runSchedulerCommand(cluster, commandToRun);
46-
catch err
47-
cmdFailed = true;
48-
cmdOut = err.message;
49-
end
50-
% If a job is already in a terminal state, qdel will return a failed
51-
% error code and cmdOut will be of the form:
52-
% 'denied: job "2979" does not exist'
53-
% If this happens we do not consider the command to have failed.
54-
if cmdFailed && ~contains(cmdOut, 'does not exist')
55-
% Record if the task errored when being cancelled, either through a bad
56-
% exit code or if an error was thrown. We'll report this as a warning.
57-
erroredTaskAndCauseString = sprintf('Job ID: %s\tReason: %s', schedulerID, strtrim(cmdOut));
58-
dctSchedulerMessage(1, '%s: Failed to cancel task %s on cluster. Reason:\n\t%s', currFilename, schedulerID, cmdOut);
59-
end
60-
61-
% Warn if task cancellation failed.
62-
OK = isempty(erroredTaskAndCauseString);
63-
if ~OK
64-
warning('parallelexamples:GenericGridEngine:FailedToCancelTask', ...
65-
'Failed to cancel the task on the cluster:\n %s\n', ...
66-
erroredTaskAndCauseString);
67-
end
9+
OK = cancelTaskOnCluster(cluster, task);
6810

6911
end

communicatingSubmitFcn.m

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,18 @@ function communicatingSubmitFcn(cluster, job, environmentProperties)
2525

2626
% Get the correct quote and file separator for the Cluster OS.
2727
% This check is unnecessary in this file because we explicitly
28-
% checked that the ClusterOsType is unix. This code is an example
28+
% checked that the clusterOS is unix. This code is an example
2929
% of how to deal with clusters that can be unix or pc.
3030
if strcmpi(clusterOS, 'unix')
3131
quote = '''';
3232
fileSeparator = '/';
33+
scriptExt = '.sh';
34+
shellCmd = 'sh';
3335
else
3436
quote = '"';
3537
fileSeparator = '\';
38+
scriptExt = '.bat';
39+
shellCmd = 'cmd /c';
3640
end
3741

3842
if isprop(cluster.AdditionalProperties, 'ClusterHost')
@@ -131,7 +135,7 @@ function communicatingSubmitFcn(cluster, job, environmentProperties)
131135
quotedLogFile = sprintf('%s%s%s', quote, logFile, quote);
132136
dctSchedulerMessage(5, '%s: Using %s as log file', currFilename, quotedLogFile);
133137

134-
jobName = sprintf('Job%d', job.ID);
138+
jobName = sprintf('MATLAB_R%s_Job%d', version('-release'), job.ID);
135139

136140
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
137141
%% CUSTOMIZATION MAY BE REQUIRED %%
@@ -144,29 +148,32 @@ function communicatingSubmitFcn(cluster, job, environmentProperties)
144148
end
145149
commonSubmitArgs = getCommonSubmitArgs(cluster);
146150
additionalSubmitArgs = strtrim(sprintf('%s %s', additionalSubmitArgs, commonSubmitArgs));
147-
148-
% Extension to use for scripts
149-
scriptExt = '.sh';
151+
if validatedPropValue(cluster.AdditionalProperties, 'DisplaySubmitArgs', 'logical', false)
152+
fprintf('Submit arguments: %s\n', additionalSubmitArgs);
153+
end
150154

151155
% Path to the submit script, to submit the Grid Engine job using qsub
152-
localSubmitScriptPath = [tempname(localJobDirectory) scriptExt];
153-
[~, submitScriptName, submitScriptExt] = fileparts(localSubmitScriptPath);
154-
submitScriptPathOnCluster = sprintf('%s%s%s%s', jobDirectoryOnCluster, fileSeparator, submitScriptName, submitScriptExt);
156+
submitScriptName = sprintf('submitScript%s', scriptExt);
157+
localSubmitScriptPath = sprintf('%s%s%s', localJobDirectory, fileSeparator, submitScriptName);
158+
submitScriptPathOnCluster = sprintf('%s%s%s', jobDirectoryOnCluster, fileSeparator, submitScriptName);
155159
quotedSubmitScriptPathOnCluster = sprintf('%s%s%s', quote, submitScriptPathOnCluster, quote);
156160

157161
% Path to the environment wrapper, which will set the environment variables
158162
% for the job then execute the job wrapper
159-
localEnvScriptPath = [tempname(localJobDirectory) scriptExt];
160-
[~, envScriptName, envScriptExt] = fileparts(localEnvScriptPath);
161-
envScriptPathOnCluster = sprintf('%s%s%s%s', jobDirectoryOnCluster, fileSeparator, envScriptName, envScriptExt);
163+
envScriptName = sprintf('environmentWrapper%s', scriptExt);
164+
localEnvScriptPath = sprintf('%s%s%s', localJobDirectory, fileSeparator, envScriptName);
165+
envScriptPathOnCluster = sprintf('%s%s%s', jobDirectoryOnCluster, fileSeparator, envScriptName);
162166
quotedEnvScriptPathOnCluster = sprintf('%s%s%s', quote, envScriptPathOnCluster, quote);
163167

168+
% Create the scripts to submit a Grid Engine job.
169+
% These will be created in the job directory.
170+
dctSchedulerMessage(5, '%s: Generating scripts for job %d', currFilename, job.ID);
164171
createEnvironmentWrapper(localEnvScriptPath, quotedWrapperPath, variables);
165172
createSubmitScript(localSubmitScriptPath, jobName, quotedLogFile, ...
166173
quotedEnvScriptPathOnCluster, additionalSubmitArgs);
167174

168175
% Create the command to run on the cluster
169-
commandToRun = sprintf('sh %s', quotedSubmitScriptPathOnCluster);
176+
commandToRun = sprintf('%s %s', shellCmd, quotedSubmitScriptPathOnCluster);
170177

171178
if ~cluster.HasSharedFilesystem
172179
% Start the mirror to copy all the job files over to the cluster
@@ -239,7 +246,7 @@ function communicatingSubmitFcn(cluster, job, environmentProperties)
239246
if verLessThan('matlab', '9.7') % schedulerID stored in job data
240247
jobData.ClusterJobIDs = jobIDs;
241248
else % schedulerID on task since 19b
242-
if numel(job.Tasks) == 1
249+
if isscalar(job.Tasks)
243250
schedulerIDs = jobIDs{1};
244251
else
245252
schedulerIDs = repmat(jobIDs, size(job.Tasks));

deleteJobFcn.m

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ function deleteJobFcn(cluster, job)
44
% Set your cluster's PluginScriptsLocation to the parent folder of this
55
% function to run it when you delete a job.
66

7-
% Copyright 2017-2022 The MathWorks, Inc.
7+
% Copyright 2017-2023 The MathWorks, Inc.
88

9-
cancelJobFcn(cluster, job);
9+
cancelJobOnCluster(cluster, job);
1010

1111
if cluster.HasSharedFilesystem
1212
% If we delete the job files before Grid Engine has actually finished
@@ -36,7 +36,9 @@ function deleteJobFcn(cluster, job)
3636
dctSchedulerMessage(4, '%s: Checking job does not exist on scheduler using command:\n\t%s.', currFilename, commandToRun);
3737
try
3838
[cmdFailed, ~] = runSchedulerCommand(cluster, commandToRun);
39-
catch err %#ok<NASGU>
39+
catch err
40+
dctSchedulerMessage(5, '%s: Command ''%s'' failed:\n%s.', ...
41+
currFilename, commandToRun, err.message);
4042
cmdFailed = true;
4143
end
4244

deleteTaskFcn.m

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
function deleteTaskFcn(cluster, task)
2-
%DELETEJOBFCN Deletes a job on cluster
2+
%DELETETASKFCN Deletes a task on Grid Engine
33
%
44
% Set your cluster's PluginScriptsLocation to the parent folder of this
55
% function to run it when you delete a job.
66

7-
% Copyright 2020-2022 The MathWorks, Inc.
7+
% Copyright 2020-2023 The MathWorks, Inc.
88

9-
cancelTaskFcn(cluster, task);
9+
cancelTaskOnCluster(cluster, task);
1010

1111
end

0 commit comments

Comments
 (0)