-
Notifications
You must be signed in to change notification settings - Fork 1k
fix(cluster_family): Cancel slot migration from incoming node on OOM #5000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -726,6 +726,8 @@ static string_view StateToStr(MigrationState state) { | |
return "ERROR"sv; | ||
case MigrationState::C_FINISHED: | ||
return "FINISHED"sv; | ||
case MigrationState::C_FATAL: | ||
return "FATAL"sv; | ||
} | ||
DCHECK(false) << "Unknown State value " << static_cast<underlying_type_t<MigrationState>>(state); | ||
return "UNDEFINED_STATE"sv; | ||
|
@@ -765,7 +767,6 @@ void ClusterFamily::DflySlotMigrationStatus(CmdArgList args, SinkReplyBuilder* b | |
}; | ||
|
||
for (const auto& m : incoming_migrations_jobs_) { | ||
// TODO add error status | ||
append_answer("in", m->GetSourceID(), node_id, m->GetState(), m->GetKeyCount(), | ||
m->GetErrorStr()); | ||
} | ||
|
@@ -834,7 +835,7 @@ ClusterFamily::TakeOutOutgoingMigrations(shared_ptr<ClusterConfig> new_config, | |
removed_slots.Merge(slots); | ||
LOG(INFO) << "Outgoing migration cancelled: slots " << slots.ToString() << " to " | ||
<< migration.GetHostIp() << ":" << migration.GetPort(); | ||
migration.Finish(); | ||
migration.Finish(MigrationState::C_FINISHED); | ||
res.migrations.push_back(std::move(*it)); | ||
outgoing_migration_jobs_.erase(it); | ||
} | ||
|
@@ -925,7 +926,7 @@ void ClusterFamily::InitMigration(CmdArgList args, SinkReplyBuilder* builder) { | |
|
||
if (!migration) { | ||
VLOG(1) << "Unrecognized incoming migration from " << source_id; | ||
return builder->SendSimpleString(OutgoingMigration::kUnknownMigration); | ||
return builder->SendSimpleString(kUnknownMigration); | ||
} | ||
|
||
if (migration->GetState() != MigrationState::C_CONNECTING) { | ||
|
@@ -936,6 +937,10 @@ void ClusterFamily::InitMigration(CmdArgList args, SinkReplyBuilder* builder) { | |
DeleteSlots(slots); | ||
} | ||
|
||
if (migration->GetState() == MigrationState::C_FATAL) { | ||
return builder->SendError(absl::StrCat("-", kIncomingMigrationOOM)); | ||
} | ||
|
||
migration->Init(flows_num); | ||
|
||
return builder->SendOk(); | ||
|
@@ -955,6 +960,7 @@ void ClusterFamily::DflyMigrateFlow(CmdArgList args, SinkReplyBuilder* builder, | |
cntx->conn()->SetName(absl::StrCat("migration_flow_", source_id)); | ||
|
||
auto migration = GetIncomingMigration(source_id); | ||
|
||
if (!migration) { | ||
return builder->SendError(kIdNotFound); | ||
} | ||
|
@@ -1033,15 +1039,19 @@ void ClusterFamily::DflyMigrateAck(CmdArgList args, SinkReplyBuilder* builder) { | |
[source_id = source_id](const auto& m) { return m.node_info.id == source_id; }); | ||
if (m_it == in_migrations.end()) { | ||
LOG(WARNING) << "migration isn't in config"; | ||
return builder->SendError(OutgoingMigration::kUnknownMigration); | ||
return builder->SendError(kUnknownMigration); | ||
} | ||
|
||
auto migration = GetIncomingMigration(source_id); | ||
if (!migration) | ||
return builder->SendError(kIdNotFound); | ||
|
||
if (!migration->Join(attempt)) { | ||
return builder->SendError("Join timeout happened"); | ||
if (migration->GetState() == MigrationState::C_FATAL) { | ||
return builder->SendError(absl::StrCat("-", kIncomingMigrationOOM)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need the "-"? Maybe we can make it part of kIncomingMigrationOOM. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If |
||
} else { | ||
return builder->SendError("Join timeout happened"); | ||
} | ||
} | ||
|
||
ApplyMigrationSlotRangeToConfig(migration->GetSourceID(), migration->GetSlots(), true); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,10 +50,20 @@ class IncomingSlotMigration { | |
return source_id_; | ||
} | ||
|
||
// Switches this migration to the C_FATAL state and stores `err` as the last error. | ||
// Also increments errors_count_. Per ABSL_LOCKS_EXCLUDED, the caller must hold neither | ||
// state_mu_ nor error_mu_ when calling this method. | ||
void ReportFatalError(dfly::GenericError err) ABSL_LOCKS_EXCLUDED(state_mu_, error_mu_) { | ||
errors_count_.fetch_add(1, std::memory_order_relaxed); | ||
// Lock order here is state_mu_ first, then error_mu_; both are held for the two writes below. | ||
// NOTE(review): any other path that takes these two mutexes should use the same order - confirm. | ||
util::fb2::LockGuard lk_state(state_mu_); | ||
util::fb2::LockGuard lk_error(error_mu_); | ||
state_ = MigrationState::C_FATAL; | ||
last_error_ = std::move(err); | ||
} | ||
Comment on lines
+54
to
+60
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't like this code. Let's think about how to make it better. Maybe we can use state_mu_ for the error too, or merge this method with ReportError. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed, but I changed ReportError to check |
||
|
||
void ReportError(dfly::GenericError err) ABSL_LOCKS_EXCLUDED(error_mu_) { | ||
errors_count_.fetch_add(1, std::memory_order_relaxed); | ||
util::fb2::LockGuard lk(error_mu_); | ||
last_error_ = std::move(err); | ||
if (GetState() != MigrationState::C_FATAL) | ||
last_error_ = std::move(err); | ||
} | ||
|
||
std::string GetErrorStr() const ABSL_LOCKS_EXCLUDED(error_mu_) { | ||
|
@@ -75,6 +85,7 @@ class IncomingSlotMigration { | |
std::vector<std::unique_ptr<ClusterShardMigration>> shard_flows_; | ||
SlotRanges slots_; | ||
ExecutionState cntx_; | ||
|
||
mutable util::fb2::Mutex error_mu_; | ||
dfly::GenericError last_error_ ABSL_GUARDED_BY(error_mu_); | ||
std::atomic<size_t> errors_count_ = 0; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,7 +36,8 @@ class OutgoingMigration::SliceSlotMigration : private ProtocolClient { | |
SliceSlotMigration(DbSlice* slice, ServerContext server_context, SlotSet slots, | ||
journal::Journal* journal, OutgoingMigration* om) | ||
: ProtocolClient(server_context), streamer_(slice, std::move(slots), journal, &exec_st_) { | ||
exec_st_.SwitchErrorHandler([om](auto ge) { om->Finish(std::move(ge)); }); | ||
exec_st_.SwitchErrorHandler( | ||
[om](auto ge) { om->Finish(MigrationState::C_ERROR, std::move(ge)); }); | ||
} | ||
|
||
~SliceSlotMigration() { | ||
|
@@ -138,10 +139,8 @@ void OutgoingMigration::OnAllShards( | |
}); | ||
} | ||
|
||
void OutgoingMigration::Finish(GenericError error) { | ||
auto next_state = MigrationState::C_FINISHED; | ||
void OutgoingMigration::Finish(MigrationState next_state, GenericError error) { | ||
if (error) { | ||
next_state = MigrationState::C_ERROR; | ||
LOG(WARNING) << "Finish outgoing migration for " << cf_->MyID() << ": " | ||
<< migration_info_.node_info.id << " with error: " << error.Format(); | ||
exec_st_.ReportError(std::move(error)); | ||
|
@@ -164,6 +163,7 @@ void OutgoingMigration::Finish(GenericError error) { | |
|
||
case MigrationState::C_SYNC: | ||
case MigrationState::C_ERROR: | ||
case MigrationState::C_FATAL: | ||
should_cancel_flows = true; | ||
break; | ||
} | ||
|
@@ -221,6 +221,14 @@ void OutgoingMigration::SyncFb() { | |
continue; | ||
} | ||
|
||
// Break outgoing migration if INIT from incoming node responded with OOM. Usually this will | ||
// happen on second iteration after first failed with OOM. Sending second INIT is required to | ||
// cleanup slots on incoming slot migration node. | ||
if (CheckRespSimpleError(kIncomingMigrationOOM)) { | ||
ChangeState(MigrationState::C_FATAL); | ||
break; | ||
} | ||
|
||
Comment on lines
+224
to
+231
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can move it into the next `if`. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see any problem with returning the error as an error. |
||
if (!CheckRespIsSimpleReply("OK")) { | ||
if (CheckRespIsSimpleReply(kUnknownMigration)) { | ||
const absl::Duration passed = absl::Now() - start_time; | ||
|
@@ -272,7 +280,11 @@ void OutgoingMigration::SyncFb() { | |
|
||
long attempt = 0; | ||
while (GetState() != MigrationState::C_FINISHED && !FinalizeMigration(++attempt)) { | ||
// process commands that were on pause and try again | ||
// Break loop and don't sleep in case of C_FATAL | ||
if (GetState() == MigrationState::C_FATAL) { | ||
break; | ||
} | ||
// Process commands that were on pause and try again | ||
VLOG(1) << "Waiting for migration to finalize..."; | ||
ThisFiber::SleepFor(500ms); | ||
} | ||
|
@@ -355,6 +367,12 @@ bool OutgoingMigration::FinalizeMigration(long attempt) { | |
return false; | ||
} | ||
|
||
// Check OOM from incoming slot migration on ACK request | ||
if (CheckRespSimpleError(kIncomingMigrationOOM)) { | ||
Finish(MigrationState::C_FATAL, std::string(kIncomingMigrationOOM)); | ||
return false; | ||
} | ||
|
||
if (!CheckRespFirstTypes({RespExpr::INT64})) { | ||
LOG(WARNING) << "Incorrect response type for " << cf_->MyID() << " : " | ||
<< migration_info_.node_info.id << " attempt " << attempt | ||
|
@@ -371,7 +389,7 @@ bool OutgoingMigration::FinalizeMigration(long attempt) { | |
} | ||
|
||
if (!exec_st_.GetError()) { | ||
Finish(); | ||
Finish(MigrationState::C_FINISHED); | ||
keys_number_ = cluster::GetKeyCount(migration_info_.slot_ranges); | ||
cf_->ApplyMigrationSlotRangeToConfig(migration_info_.node_info.id, migration_info_.slot_ranges, | ||
false); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it should be an error, not a string, like below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The part of the code that handles `kUnknownMigration` uses both `SendError` and `SendSimpleString`. This PR didn't change that, but it moved the constant to a different location. `kUnknownMigration` will be matched only if `SendSimpleString` is used.