Skip to content

Commit 14b3c7a

Browse files
committed
Handle resuming into same subdir
Before this change, when resuming into the same subdir that we resumed from, the older data files would be overwritten, because we do not track the writer's file indices, and resuming for some writers would be very non-trivial (metadata handling etc.). To avoid data loss, following the discussion around issue #56 on GitHub, we enact the following policy: 1. nothing changes if we resume into a new directory; 2. we prevent resuming into an existing directory that is _not_ the same one we are resuming from; 3. when resuming into the same directory we resume from, the actual problem dir is shifted into the first available `resumeN` subdir.
1 parent 28df48c commit 14b3c7a

File tree

1 file changed

+61
-1
lines changed

1 file changed

+61
-1
lines changed

src/ProblemCore.cc

+61-1
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,16 @@ static int mkdir_p(string const& path, mode_t mode)
992992
return err == 0 ? 0 : errno;
993993
}
994994

995+
// Return the canonical form of path as resolved by realpath(3).
//
// path is the path to canonicalize; realpath() must be able to resolve it.
// Throws runtime_error if realpath() fails (it returns a null pointer in
// that case, which must not be handed to the string constructor).
static string canonical_path(string const& path)
{
	// with a null second argument, realpath() returns a malloc()ed buffer
	// that we are responsible for releasing with free()
	char *canonical_c = realpath(path.c_str(), nullptr);
	if (!canonical_c)
		throw runtime_error("cannot determine canonical path of '" + path + "'");
	string canonical(canonical_c);
	free(canonical_c);
	return canonical;
}
1002+
1003+
// Permission bits (read/write/execute for user, group and others) passed to
// mkdir_p when creating the problem directory and its resumeN subdirectories
constexpr mode_t problem_dir_mode = S_IRWXU | S_IRWXG | S_IRWXO;
1004+
9951005
string const&
9961006
ProblemCore::create_problem_dir(void)
9971007
{
@@ -1006,13 +1016,63 @@ ProblemCore::create_problem_dir(void)
10061016
time_str[17] = '\0';
10071017
m_problem_dir = "./tests/" + m_name + string(time_str);
10081018
}
1019+
1020+
// ensure the problem dir does _not_ have a / at the end
1021+
while (m_problem_dir.back() == '/')
1022+
m_problem_dir = m_problem_dir.substr(0, m_problem_dir.length() - 1);
10091023
cout << "Using problem dir " << m_problem_dir << endl;
10101024

10111025
// TODO it should be possible to specify a directory with %-like
10121026
// replaceable strings, such as %{problem} => problem name,
10131027
// %{time} => launch time, etc.
10141028

1015-
int err = mkdir_p(m_problem_dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO);
1029+
int err = mkdir_p(m_problem_dir.c_str(), problem_dir_mode);
1030+
1031+
// If the directory exists, and we are resuming, then
1032+
// 1. we only continue if we're resuming into the same directory we're resuming from
1033+
// 2. we resume into a subdirectory, to avoid overwriting the original data files
1034+
if (err == EEXIST && gdata->resume) {
1035+
// we want to check that the canonical form of the problem dir is the
1036+
// initial substring of the canonical form of the resume file, including the path
1037+
// separator (i.e. don't mess if we are resuming into tests/something from
1038+
// tests/something-else/data/hotfile). The canonical path doesn't have a terminating /,
1039+
// so we add one.
1040+
const string& resume_file = gdata->clOptions->resume_fname;
1041+
string canonical_problem_dir( canonical_path(m_problem_dir) + "/");
1042+
string canonical_resume_file( canonical_path(resume_file) );
1043+
if (canonical_resume_file.substr(0, canonical_problem_dir.length()) != canonical_problem_dir) {
1044+
throw runtime_error("refusing to resume from " +
1045+
resume_file + " (" + canonical_resume_file + ") into unrelated, existing " +
1046+
m_problem_dir + " (" + canonical_problem_dir + ")");
1047+
}
1048+
1049+
// OK, we're now in the ’resume in the same directory’ case. To avoid overwriting the previous
1050+
// simulation data, and since we cannot (currently) correctly recover the index and restart,
1051+
// we actually shift our problem dir into problemdir/resumeN, where N is the first available index
1052+
// (so that e.g. the third time we resume, our effective problem dir becomes
1053+
// problemdir/resume003/
1054+
// TODO FIXME of course it would actually be preferable to resume correctly instead
1055+
unsigned resume_count = 0;
1056+
char new_dir[] = { '/', 'r', 'e', 's', 'u', 'm', 'e', '0', '0', '0', 0 };
1057+
string candidate;
1058+
cout << "Resuming into same directory, looking for next candidate" << endl;
1059+
#define MAX_RESUMES 1000
1060+
for ( ; resume_count < MAX_RESUMES; ++resume_count) {
1061+
snprintf(new_dir + 7, 4, "%03u", resume_count);
1062+
candidate = m_problem_dir + new_dir;
1063+
err = mkdir_p(candidate.c_str(), problem_dir_mode);
1064+
if (err != EEXIST) break;
1065+
cout << "\t" << candidate << " exists" << endl;
1066+
}
1067+
if (resume_count == MAX_RESUMES) {
1068+
throw runtime_error("Tried to resume too much (1,000 attempts made), please look into this");
1069+
}
1070+
m_problem_dir = candidate;
1071+
if (err == 0)
1072+
cout << "OK, resuming into " + candidate << endl;
1073+
// if err != 0, this will be caught by the next switch, shared with the non-resume case
1074+
}
1075+
10161076
switch (err) {
10171077
case EEXIST:
10181078
cerr << "WARNING: problem directory " << m_problem_dir << " exists already, overwriting." << endl;

0 commit comments

Comments
 (0)