Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pageserver: make control_plane_api & generations fully mandatory #10715

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions pageserver/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ pub struct PageServerConf {
/// not terrible.
pub background_task_maximum_delay: Duration,

pub control_plane_api: Option<Url>,
pub control_plane_api: Url,

/// JWT token for use with the control plane API.
pub control_plane_api_token: Option<SecretString>,
Expand Down Expand Up @@ -385,7 +385,8 @@ impl PageServerConf {
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
control_plane_api,
control_plane_api: control_plane_api
.ok_or_else(|| anyhow::anyhow!("`control_plane_api` must be set"))?,
control_plane_emergency_mode,
heatmap_upload_concurrency,
secondary_download_concurrency,
Expand Down Expand Up @@ -486,6 +487,7 @@ impl PageServerConf {
metric_collection_interval: Duration::from_secs(60),
synthetic_size_calculation_interval: Duration::from_secs(60),
background_task_maximum_delay: Duration::ZERO,
control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
..Default::default()
};
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
Expand Down Expand Up @@ -555,9 +557,12 @@ mod tests {
use super::PageServerConf;

#[test]
fn test_empty_config_toml_is_valid() {
// we use Default impl of everything in this situation
fn test_minimal_config_toml_is_valid() {
// The minimal valid config for running a pageserver:
// - control_plane_api is mandatory, as pageservers cannotrun in isolation
// - we use Default impl of everything else in this situation
let input = r#"
control_plane_api = "http://localhost:6666"
"#;
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
.expect("empty config is valid");
Expand Down
13 changes: 4 additions & 9 deletions pageserver/src/controller_upcall_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,8 @@ pub trait ControlPlaneGenerationsApi {
}

impl ControllerUpcallClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
let mut url = match conf.control_plane_api.as_ref() {
Some(u) => u.clone(),
None => return None,
};
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Self {
let mut url = conf.control_plane_api.clone();

if let Ok(mut segs) = url.path_segments_mut() {
// This ensures that `url` ends with a slash if it doesn't already.
Expand All @@ -74,12 +69,12 @@ impl ControllerUpcallClient {
client = client.default_headers(headers);
}

Some(Self {
Self {
http_client: client.build().expect("Failed to construct HTTP client"),
base_url: url,
node_id: conf.id,
cancel: cancel.clone(),
})
}
}

async fn retry_http_forever<R, T>(
Expand Down
11 changes: 4 additions & 7 deletions pageserver/src/deletion_queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ impl DeletionQueue {
/// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
pub fn new<C>(
remote_storage: GenericRemoteStorage,
controller_upcall_client: Option<C>,
controller_upcall_client: C,
conf: &'static PageServerConf,
) -> (Self, DeletionQueueWorkers<C>)
where
Expand Down Expand Up @@ -707,7 +707,7 @@ mod test {
async fn restart(&mut self) {
let (deletion_queue, workers) = DeletionQueue::new(
self.storage.clone(),
Some(self.mock_control_plane.clone()),
self.mock_control_plane.clone(),
self.harness.conf,
);

Expand Down Expand Up @@ -818,11 +818,8 @@ mod test {

let mock_control_plane = MockControlPlane::new();

let (deletion_queue, worker) = DeletionQueue::new(
storage.clone(),
Some(mock_control_plane.clone()),
harness.conf,
);
let (deletion_queue, worker) =
DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf);

let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());

Expand Down
26 changes: 11 additions & 15 deletions pageserver/src/deletion_queue/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ where
tx: tokio::sync::mpsc::Sender<DeleterMessage>,

// Client for calling into control plane API for validation of deletes
controller_upcall_client: Option<C>,
controller_upcall_client: C,

// DeletionLists which are waiting generation validation. Not safe to
// execute until [`validate`] has processed them.
Expand Down Expand Up @@ -94,7 +94,7 @@ where
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
controller_upcall_client: Option<C>,
controller_upcall_client: C,
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
cancel: CancellationToken,
) -> Self {
Expand Down Expand Up @@ -145,20 +145,16 @@ where
return Ok(());
}

let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client {
match controller_upcall_client
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
.await
{
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
// The only way a validation call returns an error is when the cancellation token fires
return Err(DeletionQueueError::ShuttingDown);
}
let tenants_valid = match self
.controller_upcall_client
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
.await
{
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
// The only way a validation call returns an error is when the cancellation token fires
return Err(DeletionQueueError::ShuttingDown);
}
} else {
// Control plane API disabled. In legacy mode we consider everything valid.
tenant_generations.keys().map(|k| (*k, true)).collect()
};

let mut validated_sequence: Option<u64> = None;
Expand Down
4 changes: 1 addition & 3 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4028,9 +4028,7 @@ impl Tenant {
deletion_queue_client: DeletionQueueClient,
l0_flush_global_state: L0FlushGlobalState,
) -> Tenant {
debug_assert!(
!attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
);
assert!(!attached_conf.location.generation.is_none());

let (state, mut rx) = watch::channel(state);

Expand Down
20 changes: 5 additions & 15 deletions pageserver/src/tenant/mgr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,9 @@ async fn init_load_generations(
"Emergency mode! Tenants will be attached unsafely using their last known generation"
);
emergency_generations(tenant_confs)
} else if let Some(client) = ControllerUpcallClient::new(conf, cancel) {
} else {
let client = ControllerUpcallClient::new(conf, cancel);

info!("Calling {} API to re-attach tenants", client.base_url());
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
match client.re_attach(conf).await {
Expand All @@ -360,9 +362,6 @@ async fn init_load_generations(
anyhow::bail!("Shut down while waiting for control plane re-attach response")
}
}
} else {
info!("Control plane API not configured, tenant generations are disabled");
return Ok(None);
};

// The deletion queue needs to know about the startup attachment state to decide which (if any) stored
Expand Down Expand Up @@ -1139,17 +1138,8 @@ impl TenantManager {
// Testing hack: if we are configured with no control plane, then drop the generation
// from upserts. This enables creating generation-less tenants even though neon_local
// always uses generations when calling the location conf API.
let attached_conf = if cfg!(feature = "testing") {
let mut conf = AttachedTenantConf::try_from(new_location_config)
.map_err(UpsertLocationError::BadRequest)?;
if self.conf.control_plane_api.is_none() {
conf.location.generation = Generation::none();
}
conf
} else {
AttachedTenantConf::try_from(new_location_config)
.map_err(UpsertLocationError::BadRequest)?
};
let attached_conf = AttachedTenantConf::try_from(new_location_config)
.map_err(UpsertLocationError::BadRequest)?;

let tenant = tenant_spawn(
self.conf,
Expand Down
3 changes: 1 addition & 2 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -1084,8 +1084,7 @@ def __init__(self, config: NeonEnvBuilder):
"pageservers": [],
}

if self.control_plane_api is not None:
cfg["control_plane_api"] = self.control_plane_api
cfg["control_plane_api"] = self.control_plane_api

if self.control_plane_compute_hook_api is not None:
cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api
Expand Down
2 changes: 1 addition & 1 deletion test_runner/regress/test_pageserver_generations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Tests in this module exercise the pageserver's behavior around generation numbers,
as defined in docs/rfcs/025-generation-numbers.md. Briefly, the behaviors we require
of the pageserver are:
- Do not start a tenant without a generation number if control_plane_api is set
- Do not start a tenant without a generation number
- Remote objects must be suffixed with generation
- Deletions may only be executed after validating generation
- Updates to remote_consistent_lsn may only be made visible after validating generation
Expand Down
Loading