From fee5dbf3e057674e09c96020a8faa17e0ffc60ec Mon Sep 17 00:00:00 2001 From: qhu Date: Fri, 26 Apr 2024 15:52:32 +0000 Subject: [PATCH] [BACKPORT 2.14][#21371] docdb: Hidden split parent tablets showing as leaderless in master's leaderless tablet endpoint Summary: Original commit: 5f383333fa3d268293d25a024253cfa2bd2dfc45 / D33401 Customer Impact: Universe is showing the load balanced status as true, but upon checking replica info, it's discovered that there are leaderless tablets, which are actually hidden split parent tablets. If you have Point-In-Time Recovery (PITR) enabled, the tablets aren't actually deleted if you drop the table until the retention window ends; they are just put into a hidden state. This issue can also occur in non-PITR scenarios where the split parent tablets are left undeleted for a long time (>120s). This can happen if one of the split children failed to register. Cause: When the tablet is marked as TABLET_DATA_SPLIT_COMPLETED, metrics for this tablet are skipped from being sent to the master. Consequently, the leader lease is not updated on the master. The fix is to also report metrics for tablets with state TABLET_DATA_SPLIT_COMPLETED. Jira: DB-10268 Test Plan: MasterPathHandlersItest.TestHiddenSplitParentTablet Reviewers: asrivastava, zdrudi Reviewed By: asrivastava Subscribers: bogdan, qhu, ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D33716 --- .../master_path_handlers-itest.cc | 49 +++++++++++++++++++ ...tserver_metrics_heartbeat_data_provider.cc | 3 +- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/yb/integration-tests/master_path_handlers-itest.cc b/src/yb/integration-tests/master_path_handlers-itest.cc index e5b86115216f..40e521f537fb 100644 --- a/src/yb/integration-tests/master_path_handlers-itest.cc +++ b/src/yb/integration-tests/master_path_handlers-itest.cc @@ -69,8 +69,10 @@ DECLARE_int32(follower_unavailable_considered_failed_sec); DECLARE_int32(cleanup_split_tablets_interval_sec); DECLARE_int32(catalog_manager_bg_task_wait_ms); +DECLARE_int32(tserver_heartbeat_metrics_interval_ms); DECLARE_bool(enable_automatic_tablet_splitting); DECLARE_int32(leaderless_tablet_alert_delay_secs); +DECLARE_bool(TEST_skip_deleting_split_tablets); namespace yb { namespace master { @@ -323,6 +325,53 @@ TEST_F_EX(MasterPathHandlersItest, Forward, MultiMasterPathHandlersItest) { } } +class TabletSplitMasterPathHandlersItest : public MasterPathHandlersItest { + public: + void SetUp() override { + ANNOTATE_UNPROTECTED_WRITE(FLAGS_cleanup_split_tablets_interval_sec) = 1; + ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = false; + ANNOTATE_UNPROTECTED_WRITE(FLAGS_tserver_heartbeat_metrics_interval_ms) = 1000; + MasterPathHandlersItest::SetUp(); + } + + void InsertRows(const client::TableHandle& table, int num_rows_to_insert) { + auto session = client_->NewSession(); + session->SetTimeout(60s); + for (int i = 0; i < num_rows_to_insert; i++) { + auto insert = table.NewInsertOp(); + auto req = insert->mutable_request(); + QLAddInt32HashValue(req, i); + ASSERT_OK(session->TEST_ApplyAndFlush(insert)); + } + } +}; + +// Undeleted split parent tablets shouldn't be shown as leaderless. +TEST_F_EX( + MasterPathHandlersItest, TestUndeletedParentTablet, TabletSplitMasterPathHandlersItest) { + const auto kLeaderlessTabletAlertDelaySecs = 5; + ANNOTATE_UNPROTECTED_WRITE(FLAGS_leaderless_tablet_alert_delay_secs) = + kLeaderlessTabletAlertDelaySecs; + ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = true; + + CreateTestTable(1 /* num_tablets */); + + client::TableHandle table; + ASSERT_OK(table.Open(table_name, client_.get())); + InsertRows(table, /* num_rows_to_insert = */ 500); + + auto& catalog_manager = ASSERT_RESULT(cluster_->GetLeaderMiniMaster())->catalog_manager(); + auto tablet = catalog_manager.GetTableInfo(table->id())->GetTablets()[0]; + + ASSERT_OK(yb_admin_client_->FlushTables( + {table_name}, false /* add_indexes */, 30 /* timeout_secs */, false /* is_compaction */)); + ASSERT_OK(catalog_manager.TEST_SplitTablet(tablet, 1 /* split_hash_code */)); + + SleepFor(kLeaderlessTabletAlertDelaySecs * 1s); + string result = GetLeaderlessTabletsString(); + ASSERT_EQ(result.find(tablet->id()), string::npos); +} + class MasterPathHandlersExternalItest : public MasterPathHandlersBaseItest { public: void InitCluster() override { diff --git a/src/yb/tserver/tserver_metrics_heartbeat_data_provider.cc b/src/yb/tserver/tserver_metrics_heartbeat_data_provider.cc index fa4ba55cf53d..d61ba5a128f4 100644 --- a/src/yb/tserver/tserver_metrics_heartbeat_data_provider.cc +++ b/src/yb/tserver/tserver_metrics_heartbeat_data_provider.cc @@ -80,8 +80,7 @@ void TServerMetricsHeartbeatDataProvider::DoAddData( uncompressed_file_sizes += sizes.second; num_files += tablet->GetCurrentVersionNumSSTFiles(); if (should_add_tablet_data && tablet_peer->log_available() && - tablet_peer->tablet_metadata()->tablet_data_state() == - tablet::TabletDataState::TABLET_DATA_READY) { + CanServeTabletData(tablet_peer->tablet_metadata()->tablet_data_state())) { auto storage_metadata = req->add_storage_metadata(); storage_metadata->set_tablet_id(tablet_peer->tablet_id()); storage_metadata->set_sst_file_size(sizes.first);