Skip to content

Commit 009b13f

Browse files
committed
EAMxx: fix how we find filename in rpointer
* We must consider the avg type and output freq specs, to avoid name clashing.
* Since the rhist filename contains the output control specs, there is no need to check that freq/freq_units/avg_type are unchanged upon restart: if the rhist file is found, they match.
1 parent 38fedc3 commit 009b13f

File tree

4 files changed

+87
-62
lines changed

4 files changed

+87
-62
lines changed

components/eamxx/src/share/io/scream_io_utils.cpp

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,44 +4,54 @@
44
#include "share/util/scream_utils.hpp"
55

66
#include <fstream>
7+
#include <regex>
78

89
namespace scream {
910

1011
std::string find_filename_in_rpointer (
1112
const std::string& filename_prefix,
1213
const bool model_restart,
1314
const ekat::Comm& comm,
14-
const util::TimeStamp& run_t0)
15+
const util::TimeStamp& run_t0,
16+
const OutputAvgType avg_type,
17+
const IOControl& control)
1518
{
1619
std::string filename;
1720
bool found = false;
1821
std::string content;
1922
std::string suffix = model_restart ? ".r." : ".rhist.";
23+
std::string pattern_str = filename_prefix + suffix;
24+
25+
// The AD will pass a default constructed control, since it doesn't know the values
26+
// of REST_N/REST_OPTION used in the previous run. Also, model restart is *always* INSTANT.
27+
if (model_restart) {
28+
EKAT_REQUIRE_MSG (avg_type==OutputAvgType::Instant,
29+
"Error! Model restart output should have INSTANT avg type.\n"
30+
" - input avg_type: " + e2str(avg_type) + "\n");
31+
pattern_str += e2str(OutputAvgType::Instant) + R"(.n(step|sec|min|hour|day|month|year)s_x\d+)";
32+
} else {
33+
EKAT_REQUIRE_MSG (control.output_enabled(),
34+
"Error! When restarting an output stream, we need a valid IOControl structure.\n"
35+
" - filename prefix: " + filename_prefix + "\n");
36+
pattern_str += e2str(avg_type) + "." + control.frequency_units + "_x" + std::to_string(control.frequency);
37+
}
38+
pattern_str += "." + run_t0.to_string() + ".nc";
39+
std::regex pattern (pattern_str);
40+
2041
if (comm.am_i_root()) {
2142
std::ifstream rpointer_file;
43+
2244
std::string line;
2345
rpointer_file.open("rpointer.atm");
2446

25-
// If the timestamp is in the filename, then the filename ends with "S.nc",
26-
// with S being the string representation of the timestamp
27-
auto ts_len = run_t0.to_string().size();
28-
auto extract_ts = [&] (const std::string& line) -> util::TimeStamp {
29-
auto min_size = ts_len+3;
30-
if (line.size()>=min_size) {
31-
auto ts_str = line.substr(line.size()-min_size,ts_len);
32-
auto ts = util::str_to_time_stamp(ts_str);
33-
return ts;
34-
} else {
35-
return util::TimeStamp();
36-
}
37-
};
38-
39-
while ((rpointer_file >> line) and not found) {
47+
while (std::getline(rpointer_file,line)) {
4048
content += line + "\n";
4149

42-
found = line.find(filename_prefix+suffix) != std::string::npos &&
43-
extract_ts(line)==run_t0;
44-
filename = line;
50+
if (std::regex_match(line,pattern)) {
51+
filename = line;
52+
found = true;
53+
break;
54+
}
4555
}
4656
}
4757

@@ -52,18 +62,23 @@ std::string find_filename_in_rpointer (
5262
if (not found) {
5363
broadcast_string(content,comm,comm.root_rank());
5464

55-
// If the history restart file is not found, it must be because the last
56-
// model restart step coincided with a model output step, in which case
57-
// a restart history file is not written.
58-
// If that's the case, *disable* output restart, by setting
59-
// 'Restart'->'Perform Restart' = false
60-
// in the input parameter list
61-
EKAT_ERROR_MSG (
62-
"Error! Restart requested, but no restart file found in 'rpointer.atm'.\n"
63-
" restart filename prefix: " + filename_prefix + "\n"
64-
" restart file type: " + std::string(model_restart ? "model restart" : "history restart") + "\n"
65-
" run t0 : " + run_t0.to_string() + "\n"
66-
" rpointer content:\n" + content);
65+
if (model_restart) {
66+
EKAT_ERROR_MSG (
67+
"Error! Restart requested, but no model restart file found in 'rpointer.atm'.\n"
68+
" model restart filename prefix: " + filename_prefix + "\n"
69+
" run t0 : " + run_t0.to_string() + "\n"
70+
" rpointer content:\n" + content + "\n\n");
71+
} else {
72+
EKAT_ERROR_MSG (
73+
"Error! Restart requested, but no history restart file found in 'rpointer.atm'.\n"
74+
" hist restart filename prefix: " + filename_prefix + "\n"
75+
" run t0 : " + run_t0.to_string() + "\n"
76+
" avg_type : " + e2str(avg_type) + "\n"
77+
" output freq : " + std::to_string(control.frequency) + "\n"
78+
" output freq units: " + control.frequency_units + "\n"
79+
" rpointer content:\n" + content + "\n\n"
80+
" Did you change output specs (avg type, freq, or freq units) across restart? If so, please, remember that it is not allowed.\n");
81+
}
6782
}
6883

6984
// Have the root rank communicate the nc filename

components/eamxx/src/share/io/scream_io_utils.hpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#ifndef SCREAM_IO_UTILS_HPP
22
#define SCREAM_IO_UTILS_HPP
33

4+
#include "scream_io_control.hpp"
45
#include "share/util/scream_time_stamp.hpp"
56

67
#include <ekat/util/ekat_string_utils.hpp>
@@ -59,11 +60,17 @@ inline OutputAvgType str2avg (const std::string& s) {
5960
return OAT::Invalid;
6061
}
6162

63+
// The AD will pass a default constructed control, since it doesn't know the values
64+
// of REST_N/REST_OPTION used in the previous run
65+
// Output streams MUST pass a valid control structure, because we need to differentiate
66+
// between, e.g., streams with same filename prefix, but different output freq specs
6267
std::string find_filename_in_rpointer (
63-
const std::string& casename,
68+
const std::string& filename_prefix,
6469
const bool model_restart,
6570
const ekat::Comm& comm,
66-
const util::TimeStamp& run_t0);
71+
const util::TimeStamp& run_t0,
72+
const OutputAvgType avg_type = OutputAvgType::Instant,
73+
const IOControl& control = {});
6774

6875
struct LongNames {
6976

components/eamxx/src/share/io/scream_output_manager.cpp

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,9 @@ setup (const ekat::Comm& io_comm, const ekat::ParameterList& params,
171171

172172
if (perform_history_restart) {
173173
using namespace scorpio;
174-
auto rhist_file = find_filename_in_rpointer(hist_restart_filename_prefix,false,m_io_comm,m_run_t0);
174+
IOFileSpecs hist_restart_specs;
175+
hist_restart_specs.ftype = FileType::HistoryRestart;
176+
auto rhist_file = find_filename_in_rpointer(hist_restart_filename_prefix,false,m_io_comm,m_run_t0,m_avg_type,m_output_control);
175177

176178
scorpio::register_file(rhist_file,scorpio::Read);
177179
// From restart file, get the time of last write, as well as the current size of the avg sample
@@ -196,22 +198,8 @@ setup (const ekat::Comm& io_comm, const ekat::ParameterList& params,
196198

197199
// We do NOT allow changing output specs across restart. If you do want to change
198200
// any of these, you MUST start a new output stream (e.g., setting 'Perform Restart: false')
199-
auto old_freq = scorpio::get_attribute<int>(rhist_file,"GLOBAL","averaging_frequency");
200-
EKAT_REQUIRE_MSG (old_freq == m_output_control.frequency,
201-
"Error! Cannot change frequency when performing history restart.\n"
202-
" - old freq: " << old_freq << "\n"
203-
" - new freq: " << m_output_control.frequency << "\n");
204-
auto old_freq_units = scorpio::get_attribute<std::string>(rhist_file,"GLOBAL","averaging_frequency_units");
205-
EKAT_REQUIRE_MSG (old_freq_units == m_output_control.frequency_units,
206-
"Error! Cannot change frequency units when performing history restart.\n"
207-
" - old freq units: " << old_freq_units << "\n"
208-
" - new freq units: " << m_output_control.frequency_units << "\n");
209-
auto old_avg_type = scorpio::get_attribute<std::string>(rhist_file,"GLOBAL","averaging_type");
210-
EKAT_REQUIRE_MSG (old_avg_type == e2str(m_avg_type),
211-
"Error! Cannot change avg type when performing history restart.\n"
212-
" - old avg type: " << old_avg_type + "\n"
213-
" - new avg type: " << e2str(m_avg_type) << "\n");
214-
201+
// NOTE: we do not check that freq/freq_units/avg_type are not changed: since we used
202+
// that info to find the correct rhist file, we already know that they match!
215203
auto old_storage_type = scorpio::get_attribute<std::string>(rhist_file,"GLOBAL","file_max_storage_type");
216204
EKAT_REQUIRE_MSG (old_storage_type == e2str(m_output_file_specs.storage.type),
217205
"Error! Cannot change file storage type when performing history restart.\n"

components/eamxx/src/share/io/tests/io_utils.cpp

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
TEST_CASE ("find_filename_in_rpointer") {
1010
using namespace scream;
1111

12+
constexpr auto AVG = OutputAvgType::Average;
13+
constexpr auto INST = OutputAvgType::Instant;
14+
1215
ekat::Comm comm(MPI_COMM_WORLD);
1316

1417
util::TimeStamp t0({2023,9,7},{12,0,0});
@@ -17,21 +20,33 @@ TEST_CASE ("find_filename_in_rpointer") {
1720
// Create a dummy rpointer
1821
std::ofstream rpointer ("rpointer.atm");
1922

20-
rpointer << "foo.r." + t0.to_string() + ".nc\n";
21-
rpointer << "bar2.rhist." + t0.to_string() + ".nc\n";
22-
rpointer << "bar.rhist." + t0.to_string() + ".nc\n";
23-
rpointer.close();
23+
IOControl foo_c, bar_c, bar2_c;
24+
foo_c.frequency = 3; foo_c.frequency_units = "nsteps";
25+
foo_c.frequency = 1; foo_c.frequency_units = "ndays";
26+
foo_c.frequency = 5; foo_c.frequency_units = "nhours";
2427

25-
// Now test find_filename_in_rpointer with different inputs
28+
std::string foo_fname = "foo.r.INSTANT.nsteps_x3." + t0.to_string() + ".nc";
29+
std::string bar_fname = "bar.rhist.AVERAGE.ndays_x1." + t0.to_string() + ".nc";
30+
std::string bar2_fname = "bar.rhist.AVERAGE.nhours_x6." + t0.to_string() + ".nc";
2631

27-
REQUIRE_THROWS (find_filename_in_rpointer("baz",false,comm,t0)); // wrong prefix
28-
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t1)); // wrong timestamp
29-
REQUIRE_THROWS (find_filename_in_rpointer("bar",true, comm,t0)); // bar is not model restart
30-
REQUIRE_THROWS (find_filename_in_rpointer("foo",false,comm,t0)); // foo is model restart
32+
rpointer << foo_fname<< "\n";
33+
rpointer << bar_fname<< "\n";
34+
rpointer << bar2_fname << "\n";
35+
rpointer.close();
3136

32-
REQUIRE (find_filename_in_rpointer("bar", false,comm,t0)==("bar.rhist."+t0.to_string()+".nc"));
33-
REQUIRE (find_filename_in_rpointer("bar2",false,comm,t0)==("bar2.rhist."+t0.to_string()+".nc"));
34-
REQUIRE (find_filename_in_rpointer("foo", true, comm,t0)==("foo.r."+t0.to_string()+".nc"));
37+
// Now test find_filename_in_rpointer with different inputs
38+
REQUIRE_THROWS (find_filename_in_rpointer("baz",false,comm,t0,AVG)); // missing control (needed for rhist files)
39+
REQUIRE_THROWS (find_filename_in_rpointer("baz",false,comm,t0,AVG,bar_c)); // wrong prefix
40+
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t1,AVG,bar_c)); // wrong timestamp
41+
REQUIRE_THROWS (find_filename_in_rpointer("bar",true, comm,t0,AVG,bar_c)); // bar is not model restart
42+
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t0,INST,bar_c)); // wrong avg type
43+
REQUIRE_THROWS (find_filename_in_rpointer("bar",false,comm,t0,INST,bar2_c)); // wrong freq specs
44+
REQUIRE_THROWS (find_filename_in_rpointer("foo",false,comm,t0,INST,foo_c)); // foo is model restart
45+
REQUIRE_THROWS (find_filename_in_rpointer("foo",true,comm,t0,AVG)); // model restart MUST be INSTANT
46+
47+
REQUIRE (find_filename_in_rpointer("bar",false,comm,t0,AVG,bar_c)==bar_fname);
48+
REQUIRE (find_filename_in_rpointer("bar",false,comm,t0,AVG,bar2_c)==bar2_fname);
49+
REQUIRE (find_filename_in_rpointer("foo",true, comm,t0)==foo_fname);
3550
}
3651

3752
TEST_CASE ("io_control") {

0 commit comments

Comments
 (0)