Trust Quorum: Better match protocol in TLA+ spec #8363

Open · wants to merge 1 commit into main
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions trust-quorum/Cargo.toml
@@ -16,6 +16,7 @@ derive_more.workspace = true
gfss.workspace = true
hex.workspace = true
hkdf.workspace = true
iddqd.workspace = true
rand = { workspace = true, features = ["getrandom"] }
secrecy.workspace = true
serde.workspace = true
11 changes: 11 additions & 0 deletions trust-quorum/src/configuration.rs
@@ -8,6 +8,7 @@ use crate::crypto::{EncryptedRackSecret, RackSecret, Salt, Sha3_256Digest};
use crate::validators::ValidatedReconfigureMsg;
use crate::{Epoch, PlatformId, Threshold};
use gfss::shamir::{Share, SplitError};
use iddqd::{IdOrdItem, id_upcast};
use omicron_uuid_kinds::RackUuid;
use secrecy::ExposeSecret;
use serde::{Deserialize, Serialize};
@@ -50,6 +51,16 @@ pub struct Configuration {
pub previous_configuration: Option<PreviousConfiguration>,
}

impl IdOrdItem for Configuration {
type Key<'a> = Epoch;

fn key(&self) -> Self::Key<'_> {
self.epoch
}

id_upcast!();
}

impl Configuration {
/// Create a new configuration for the trust quorum
///
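For context on the new `iddqd` dependency: the `IdOrdItem` impl above is what allows a `Configuration` to live in an `iddqd::IdOrdMap` keyed by its own `epoch` field, which appears to be how `PersistentState.configs` is used later in this diff (`insert_unique`, `get`). Below is a minimal, self-contained sketch of that pattern using a stand-in `Config` type rather than the real `Configuration`:

```rust
use iddqd::{IdOrdItem, IdOrdMap, id_upcast};

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Epoch(u64);

// Stand-in for `Configuration`: just enough fields to show the pattern.
#[derive(Debug)]
struct Config {
    epoch: Epoch,
    threshold: u8,
}

impl IdOrdItem for Config {
    type Key<'a> = Epoch;

    // The map key is extracted from the value itself.
    fn key(&self) -> Self::Key<'_> {
        self.epoch
    }

    id_upcast!();
}

fn main() {
    let mut configs: IdOrdMap<Config> = IdOrdMap::new();

    // `insert_unique` refuses to overwrite an existing entry for the same epoch.
    assert!(configs.insert_unique(Config { epoch: Epoch(1), threshold: 2 }).is_ok());
    assert!(configs.insert_unique(Config { epoch: Epoch(1), threshold: 3 }).is_err());

    // Lookups go straight through the key extracted from the value.
    assert_eq!(configs.get(&Epoch(1)).map(|c| c.threshold), Some(2));
}
```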
40 changes: 24 additions & 16 deletions trust-quorum/src/coordinator_state.rs
@@ -5,9 +5,9 @@
//! State of a reconfiguration coordinator inside a [`crate::Node`]

use crate::crypto::{LrtqShare, Sha3_256Digest, ShareDigestLrtq};
use crate::messages::{PeerMsg, PrepareMsg};
use crate::messages::PeerMsg;
use crate::validators::{ReconfigurationError, ValidatedReconfigureMsg};
use crate::{Configuration, Envelope, Epoch, PlatformId};
use crate::{Configuration, Envelope, Epoch, PeerMsgKind, PlatformId};
use gfss::shamir::Share;
use slog::{Logger, o, warn};
use std::collections::{BTreeMap, BTreeSet};
@@ -55,24 +55,26 @@ impl CoordinatorState {
log: Logger,
now: Instant,
msg: ValidatedReconfigureMsg,
Contributor Author commented: `msg` should return this platform's config and share if easy enough. Then there is no need for the option.
) -> Result<(CoordinatorState, PrepareMsg), ReconfigurationError> {
) -> Result<(CoordinatorState, Configuration, Share), ReconfigurationError>
{
// Create a configuration for this epoch
let (config, shares) = Configuration::new(&msg)?;

let mut prepares = BTreeMap::new();
// `my_prepare_msg` is optional only so that we can fill it in via
// the loop. It will always become `Some`, as a `Configuration` always
// contains the coordinator as a member as validated by construction of
// `ValidatedReconfigureMsg`.
let mut my_prepare_msg: Option<PrepareMsg> = None;
// `my_config` and `my_share` are optional only so that we can fill them
// in via the loop. They will always become `Some`, as a `Configuration`
// always contains the coordinator as a member as validated by
// construction of `ValidatedReconfigureMsg`.
let mut my_config: Option<Configuration> = None;
let mut my_share: Option<Share> = None;
for (platform_id, share) in shares.into_iter() {
let prepare_msg = PrepareMsg { config: config.clone(), share };
if platform_id == *msg.coordinator_id() {
// The prepare message to add to our `PersistentState`
my_prepare_msg = Some(prepare_msg);
// The data to add to our `PersistentState`
my_config = Some(config.clone());
my_share = Some(share);
} else {
// Create a message that requires sending
prepares.insert(platform_id, prepare_msg);
prepares.insert(platform_id, (config.clone(), share));
}
}
let op = CoordinatorOperation::Prepare {
@@ -85,7 +87,7 @@
// Safety: Construction of a `ValidatedReconfigureMsg` ensures that
// `my_platform_id` is part of the new configuration and has a share.
// We can therefore safely unwrap here.
Ok((state, my_prepare_msg.unwrap()))
Ok((state, my_config.unwrap(), my_share.unwrap()))
}

/// A reconfiguration from one group to another
@@ -161,11 +163,17 @@ impl CoordinatorState {
} => {}
CoordinatorOperation::CollectLrtqShares { members, shares } => {}
CoordinatorOperation::Prepare { prepares, prepare_acks } => {
for (platform_id, prepare) in prepares.clone().into_iter() {
let rack_id = self.reconfigure_msg.rack_id();
for (platform_id, (config, share)) in
prepares.clone().into_iter()
{
outbox.push(Envelope {
to: platform_id,
from: self.reconfigure_msg.coordinator_id().clone(),
msg: PeerMsg::Prepare(prepare),
msg: PeerMsg {
rack_id,
kind: PeerMsgKind::Prepare { config, share },
},
});
}
}
@@ -226,7 +234,7 @@ pub enum CoordinatorOperation {
},
Prepare {
/// The set of Prepares to send to each node
prepares: BTreeMap<PlatformId, PrepareMsg>,
prepares: BTreeMap<PlatformId, (Configuration, Share)>,

/// Acknowledgements that the prepare has been received
prepare_acks: BTreeSet<PlatformId>,
69 changes: 50 additions & 19 deletions trust-quorum/src/messages.rs
@@ -27,39 +27,70 @@ pub struct ReconfigureMsg {

/// Messages sent between trust quorum members over a sprockets channel
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PeerMsg {
Prepare(PrepareMsg),
pub struct PeerMsg {
pub rack_id: RackUuid,
pub kind: PeerMsgKind,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PeerMsgKind {
/// Sent from a coordinator node to inform a peer about a new configuration
Prepare {
config: Configuration,
share: Share,
},

/// Acknowledge a successful prepare from a coordinator
PrepareAck(Epoch),
Commit(CommitMsg),

/// When a node learns about a commit for a given epoch
/// but does not have a `PrepareMsg`, it must ask for it
/// from another node.
GetPrepare(Epoch),
/// Retrieve a configuration for a given epoch from a node. Nodes only
/// respond if this is the current configuration and the requesting node is
/// a member of the configuration.
GetConfig(Epoch),

/// Nodes reply with `PrepareAndCommit` to `GetPrepare` requests when they
/// are able to.
PrepareAndCommit,
/// A configuration returned in response to `GetConfig`
Config(Configuration),

/// Request a node's key share for the given epoch from that node
GetShare(Epoch),

/// Return a node's key share in response to a `GetShare` message
Share {
epoch: Epoch,
share: Share,
},

// LRTQ shares are always at epoch 0
GetLrtqShare,

LrtqShare(LrtqShare),
}

/// The start of a reconfiguration sent from a coordinator to a specific peer
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrepareMsg {
pub config: Configuration,
pub share: Share,
/// Inform a node that it is no longer part of the trust quorum as of the
/// given epoch
Expunged(Epoch),

/// Inform a node that it is utilizing an old committed configuration and
/// give it the new configuration.
///
/// As a result, a requesting node may have to retrieve key shares to
/// recompute its share if it never received a prepare message for this
/// epoch.
CommitAdvance(Configuration),
}

#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CommitMsg {
epoch: Epoch,
impl PeerMsgKind {
pub fn name(&self) -> &str {
match self {
Self::Prepare { .. } => "prepare",
Self::PrepareAck(_) => "prepare_ack",
Self::GetConfig(_) => "get_config",
Self::Config(_) => "config",
Self::GetShare(_) => "get_share",
Self::Share { .. } => "share",
Self::GetLrtqShare => "get_lrtq_share",
Self::LrtqShare(_) => "lrtq_share",
Self::Expunged(_) => "expunged",
Self::CommitAdvance(_) => "commit_advance",
}
}
}
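To make the request/response pairing in these doc comments concrete, here is a hedged sketch of a node answering `GetShare` with `Share`. The types are simplified stand-ins that only mirror the shapes defined in this file; they are not the crate's real `RackUuid`, `Share`, or message types. Keeping `rack_id` on the envelope rather than inside every variant is what lets the single guard in `Node::handle` (below) cover all message kinds, including replies like this one.

```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct RackUuid(u128);

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Epoch(u64);

#[derive(Debug, Clone)]
struct Share(Vec<u8>);

// Envelope shape: rack id once, per-message payload in `kind`.
#[derive(Debug, Clone)]
struct PeerMsg {
    rack_id: RackUuid,
    kind: PeerMsgKind,
}

#[derive(Debug, Clone)]
enum PeerMsgKind {
    GetShare(Epoch),
    Share { epoch: Epoch, share: Share },
    // ...the remaining variants are elided in this sketch
}

/// A node answering `GetShare`: the reply echoes the request's `rack_id`,
/// so the receiver's rack-id guard applies to responses as well.
fn reply_to_get_share(req: &PeerMsg, my_share: Share) -> Option<PeerMsg> {
    match &req.kind {
        PeerMsgKind::GetShare(epoch) => Some(PeerMsg {
            rack_id: req.rack_id,
            kind: PeerMsgKind::Share { epoch: *epoch, share: my_share },
        }),
        _ => None,
    }
}

fn main() {
    let req = PeerMsg { rack_id: RackUuid(1), kind: PeerMsgKind::GetShare(Epoch(3)) };
    let reply = reply_to_get_share(&req, Share(vec![0xAB])).unwrap();
    assert_eq!(reply.rack_id, req.rack_id);
}
```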
56 changes: 37 additions & 19 deletions trust-quorum/src/node.rs
@@ -3,13 +3,23 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! A trust quorum node that implements the trust quorum protocol
//!
//! Nodes respond to function calls synchronously. They do not queue up replies
//! for later. If a request like a `prepare_and_commit` operation arrives from
//! Nexus, a node may not be ready to commit immediately. It may have to reach
//! out to other nodes to get its configuration and collect shares. None of
//! this happens in one tick through this FSM. Instead, outgoing messages will
//! be queued for sending and the call will return. Higher level software must
//! keep track of incoming requests and poll this FSM for responses. This makes
//! the logical code here simpler at the cost of implementing tracking at higher
//! levels. Fortunately, tracking is easier with async code, which drives this
//! Node, and so this should not be problematic.

use crate::validators::{ReconfigurationError, ValidatedReconfigureMsg};
use crate::{
CoordinatorState, Envelope, Epoch, PersistentState, PlatformId, messages::*,
};

use slog::{Logger, o, warn};
use slog::{Logger, error, o, warn};
use std::time::Instant;

/// An entity capable of participating in trust quorum
@@ -92,8 +102,18 @@ impl Node {
from: PlatformId,
msg: PeerMsg,
) -> Option<PersistentState> {
match msg {
PeerMsg::PrepareAck(epoch) => {
if let Some(rack_id) = self.persistent_state.rack_id() {
if rack_id != msg.rack_id {
error!(self.log, "Mismatched rack id";
"from" => %from,
"msg" => msg.kind.name(),
"expected" => %rack_id,
"got" => %msg.rack_id);
return None;
}
}
match msg.kind {
PeerMsgKind::PrepareAck(epoch) => {
self.handle_prepare_ack(from, epoch);
None
}
@@ -160,24 +180,25 @@
) -> Result<Option<PersistentState>, ReconfigurationError> {
// We have no committed configuration or lrtq ledger
if self.persistent_state.is_uninitialized() {
let (coordinator_state, my_prepare_msg) =
let (coordinator_state, my_config, my_share) =
CoordinatorState::new_uninitialized(
self.log.clone(),
now,
msg,
)?;
self.coordinator_state = Some(coordinator_state);
// Add the prepare to our `PersistentState`
self.persistent_state.shares.insert(my_config.epoch, my_share);
self.persistent_state
.prepares
.insert(my_prepare_msg.config.epoch, my_prepare_msg);
.configs
.insert_unique(my_config)
.expect("empty state");

return Ok(Some(self.persistent_state.clone()));
}

// We have a committed configuration that is not LRTQ
let config =
self.persistent_state.last_committed_configuration().unwrap();
self.persistent_state.latest_committed_configuration().unwrap();

self.coordinator_state = Some(CoordinatorState::new_reconfiguration(
self.log.clone(),
@@ -258,16 +279,13 @@ mod tests {
// It should include the `PrepareMsg` for this node.
assert!(persistent_state.lrtq.is_none());
assert!(persistent_state.commits.is_empty());
assert!(persistent_state.decommissioned.is_none());
// The only `PrepareMsg` is this one for the first epoch
assert_eq!(persistent_state.prepares.len(), 1);
assert!(persistent_state.expunged.is_none());
assert_eq!(persistent_state.configs.len(), 1);
assert_eq!(persistent_state.shares.len(), 1);

// Extract the configuration for our initial prepare msg
let config = &persistent_state
.prepares
.get(&input.reconfigure_msg.epoch)
.unwrap()
.config;
let config =
persistent_state.configs.get(&input.reconfigure_msg.epoch).unwrap();

assert_eq!(config.epoch, input.reconfigure_msg.epoch);
assert_eq!(config.coordinator, *node.platform_id());
@@ -280,8 +298,8 @@
assert_eq!(outbox.len(), config.members.len() - 1);
for envelope in outbox {
assert_matches!(
envelope.msg,
PeerMsg::Prepare(PrepareMsg { config: prepare_config, .. }) => {
envelope.msg.kind,
PeerMsgKind::Prepare{ config: prepare_config, .. } => {
assert_eq!(*config, prepare_config);
});
assert_eq!(envelope.from, config.coordinator);
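The module comment added to node.rs describes a synchronous, poll-driven state machine: handlers queue outgoing envelopes and return updated persistent state, while higher-level software does the persisting, sending, and request tracking. The sketch below shows the driver pattern that implies. It uses hypothetical stand-in types and a made-up `handle` signature rather than the real `Node` API, and the persist-before-send ordering is an assumption about how callers would typically use it.

```rust
struct Envelope {
    to: String,
    bytes: Vec<u8>,
}

struct PersistentState {
    raw: Vec<u8>,
}

struct Fsm;

impl Fsm {
    /// Stand-in for a handler such as `Node::handle`: it only mutates
    /// in-memory state, queues outgoing messages on `outbox`, and reports a
    /// durable state change by returning the new state.
    fn handle(
        &mut self,
        _from: &str,
        _msg: &[u8],
        outbox: &mut Vec<Envelope>,
    ) -> Option<PersistentState> {
        outbox.push(Envelope { to: "peer-2".to_string(), bytes: Vec::new() });
        Some(PersistentState { raw: Vec::new() })
    }
}

fn drive(fsm: &mut Fsm, from: &str, msg: &[u8]) {
    let mut outbox = Vec::new();

    // 1. Feed the incoming message to the FSM; nothing is sent yet.
    let maybe_state = fsm.handle(from, msg, &mut outbox);

    // 2. If durable state changed, persist it first (assumed ordering for a
    //    prepare/ack-style protocol).
    if let Some(state) = maybe_state {
        persist(&state);
    }

    // 3. Only then drain the outbox and hand envelopes to the transport.
    for env in outbox.drain(..) {
        send(env);
    }
}

fn persist(_state: &PersistentState) { /* write the ledger to disk */ }
fn send(_env: Envelope) { /* hand off to the network transport */ }

fn main() {
    let mut fsm = Fsm;
    drive(&mut fsm, "peer-1", b"hello");
}
```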