Skip to content

Commit

Permalink
feat: new proving broker (#10174)
Browse files Browse the repository at this point in the history
This PR integrates the new proving broker implementation added in #8495
into the orchestrator and adds a new caching layer between the
orchestrator and the job broker able to resolve jobs that have already
been finished (in order to provide crash recovery for the prover-node).

---------

Co-authored-by: Santiago Palladino <[email protected]>
  • Loading branch information
alexghr and spalladino authored Nov 28, 2024
1 parent b5a6aa4 commit 6fd5fc1
Show file tree
Hide file tree
Showing 65 changed files with 2,330 additions and 1,092 deletions.
31 changes: 25 additions & 6 deletions docker-compose.provernet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ services:
ARCHIVER_POLLING_INTERVAL_MS: 1000
ARCHIVER_VIEM_POLLING_INTERVAL_MS: 1000
PROVER_VIEM_POLLING_INTERVAL_MS: 1000
PROVER_AGENT_ENABLED: false
PROVER_AGENT_COUNT: 0
PROVER_BROKER_HOST: http://aztec-prover-broker
PROVER_PUBLISHER_PRIVATE_KEY: "0xdbda1821b80551c9d65939329250298aa3472ba22feea921c0cf5d620ea67b97"
PROVER_REAL_PROOFS: "${PROVER_REAL_PROOFS:-false}"
PROVER_MINIMUM_ESCROW_AMOUNT: 1000000000
Expand All @@ -76,6 +77,8 @@ services:
depends_on:
aztec-node:
condition: service_healthy
aztec-prover-broker:
condition: service_healthy
healthcheck:
test: [ "CMD", "curl", "-fSs", "http://127.0.0.1:80/status" ]
interval: 3s
Expand All @@ -84,6 +87,21 @@ services:
command: [ "start", "--prover-node", "--archiver" ]
restart: on-failure:5

# Standalone proving job broker. The prover node and the prover agents in this file both set
# PROVER_BROKER_HOST to http://aztec-prover-broker and wait for this service to be healthy.
aztec-prover-broker:
image: "aztecprotocol/${IMAGE:-aztec:master}"
ports:
# Host port 8084 maps to the container port set through AZTEC_PORT below.
- "8084:80"
environment:
LOG_LEVEL: verbose
AZTEC_PORT: 80
healthcheck:
# Same /status probe as the other services in this compose file.
test: [ "CMD", "curl", "-fSs", "http://127.0.0.1:80/status" ]
interval: 3s
timeout: 30s
start_period: 120s
command: [ "start", "--prover-broker" ]
restart: on-failure:5

# Prover agent that connects to the prover broker for fetching proving jobs and executing them
# Multiple instances can be run, or PROVER_AGENT_CONCURRENCY can be increased to run multiple workers in a single instance
aztec-prover-agent:
Expand All @@ -93,23 +111,24 @@ services:
environment:
LOG_LEVEL: verbose
ETHEREUM_HOST: http://ethereum:8545
AZTEC_NODE_URL: http://aztec-prover # Deprecated, use PROVER_JOB_SOURCE_URL
PROVER_JOB_SOURCE_URL: http://aztec-prover
PROVER_BROKER_HOST: http://aztec-prover-broker
L1_CHAIN_ID: 31337
AZTEC_PORT: 80
PROVER_REAL_PROOFS: "${PROVER_REAL_PROOFS:-false}"
PROVER_TEST_DELAY_MS: "${PROVER_TEST_DELAY_MS:-0}"
PROVER_AGENT_CONCURRENCY: 2
BB_SKIP_CLEANUP: "${BB_SKIP_CLEANUP:-0}" # Persist tmp dirs for debugging
PROVER_ID: "${PROVER_ID:-0x01}"
volumes:
- ./log/aztec-prover-agent/:/usr/src/yarn-project/aztec/log:rw
- ./cache/bb-crs/:/root/.bb-crs:rw
- ./workdir/bb-prover/:/usr/src/yarn-project/bb:rw
depends_on:
aztec-prover:
aztec-prover-broker:
condition: service_healthy
command: [ "start", "--prover" ]
command: [ "start", "--prover-agent" ]
deploy:
mode: replicated
replicas: 2
restart: on-failure:5
healthcheck:
test: [ "CMD", "curl", "-fSs", "http://127.0.0.1:80/status" ]
Expand Down
9 changes: 9 additions & 0 deletions spartan/aztec-network/files/config/setup-service-addresses.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,19 @@ else
PROVER_NODE_ADDR="http://${SERVICE_NAME}-prover-node.${NAMESPACE}:${PROVER_NODE_PORT}"
fi

# Resolve the prover broker address. Precedence: an explicitly configured external host,
# then the public service address lookup, then the in-cluster URL built from
# SERVICE_NAME and NAMESPACE.
# NOTE(review): confirm PROVER_BROKER_EXTERNAL_HOST is actually exported to this script;
# when it is empty the fallback below is always taken.
PROVER_BROKER_ADDR="${PROVER_BROKER_EXTERNAL_HOST}"
if [ -z "${PROVER_BROKER_ADDR}" ]; then
  if [ "${NETWORK_PUBLIC}" = "true" ]; then
    PROVER_BROKER_ADDR=$(get_service_address "prover-broker" "${PROVER_BROKER_PORT}")
  else
    PROVER_BROKER_ADDR="http://${SERVICE_NAME}-prover-broker.${NAMESPACE}:${PROVER_BROKER_PORT}"
  fi
fi


# Write addresses to file for sourcing
echo "export ETHEREUM_HOST=${ETHEREUM_ADDR}" >> /shared/config/service-addresses
echo "export BOOT_NODE_HOST=${BOOT_NODE_ADDR}" >> /shared/config/service-addresses
echo "export PROVER_NODE_HOST=${PROVER_NODE_ADDR}" >> /shared/config/service-addresses
echo "export PROVER_BROKER_HOST=${PROVER_BROKER_ADDR}" >> /shared/config/service-addresses
echo "Addresses configured:"
cat /shared/config/service-addresses
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ Service Address Setup Container
value: "{{ .Values.proverNode.externalHost }}"
- name: PROVER_NODE_PORT
value: "{{ .Values.proverNode.service.nodePort }}"
- name: PROVER_BROKER_PORT
value: "{{ .Values.proverBroker.service.nodePort }}"
- name: SERVICE_NAME
value: {{ include "aztec-network.fullname" . }}
volumeMounts:
Expand Down
21 changes: 10 additions & 11 deletions spartan/aztec-network/templates/prover-agent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ spec:
- -c
- |
source /shared/config/service-addresses
until curl -s -X POST ${PROVER_NODE_HOST}/status; do
echo "Waiting for Prover node ${PROVER_NODE_HOST} ..."
until curl -s -X POST ${PROVER_BROKER_HOST}/status; do
echo "Waiting for broker ${PROVER_BROKER_HOST} ..."
sleep 5
done
echo "Prover node is ready!"
echo "Broker is ready!"
{{- if .Values.telemetry.enabled }}
until curl --head --silent {{ include "aztec-network.otelCollectorMetricsEndpoint" . }} > /dev/null; do
echo "Waiting for OpenTelemetry collector..."
Expand All @@ -77,8 +77,7 @@ spec:
- "-c"
- |
source /shared/config/service-addresses && \
PROVER_JOB_SOURCE_URL=${PROVER_NODE_HOST} \
node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js start --prover
node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js start --prover-agent
env:
- name: AZTEC_PORT
value: "{{ .Values.proverAgent.service.nodePort }}"
Expand All @@ -90,12 +89,12 @@ spec:
value: "{{ .Values.proverAgent.debug }}"
- name: PROVER_REAL_PROOFS
value: "{{ .Values.proverAgent.realProofs }}"
- name: PROVER_AGENT_ENABLED
value: "true"
- name: PROVER_AGENT_CONCURRENCY
value: {{ .Values.proverAgent.concurrency | quote }}
- name: HARDWARE_CONCURRENCY
value: {{ .Values.proverAgent.bb.hardwareConcurrency | quote }}
- name: PROVER_AGENT_COUNT
value: "1"
- name: PROVER_AGENT_POLL_INTERVAL_MS
value: "{{ .Values.proverAgent.pollIntervalMs }}"
- name: PROVER_AGENT_PROOF_TYPES
value: {{ join "," .Values.proverAgent.proofTypes | quote }}
- name: OTEL_RESOURCE_ATTRIBUTES
value: service.name={{ .Release.Name }},service.namespace={{ .Release.Namespace }},service.version={{ .Chart.AppVersion }},environment={{ .Values.environment | default "production" }}
- name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
Expand Down
104 changes: 104 additions & 0 deletions spartan/aztec-network/templates/prover-broker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{{- if .Values.proverBroker.enabled }}
# Prover broker workload: a ReplicaSet running `start --prover-broker` plus a ClusterIP Service
# in front of it. Both manifests are rendered only when .Values.proverBroker.enabled is set.
apiVersion: apps/v1
kind: ReplicaSet
metadata:
name: {{ include "aztec-network.fullname" . }}-prover-broker
labels:
{{- include "aztec-network.labels" . | nindent 4 }}
spec:
replicas: {{ .Values.proverBroker.replicas }}
selector:
matchLabels:
{{- include "aztec-network.selectorLabels" . | nindent 6 }}
app: prover-broker
template:
metadata:
labels:
{{- include "aztec-network.selectorLabels" . | nindent 8 }}
app: prover-broker
spec:
serviceAccountName: {{ include "aztec-network.fullname" . }}-node
# Public networks run the broker pod on the host network.
{{- if .Values.network.public }}
hostNetwork: true
{{- end }}
volumes:
- name: config
emptyDir: {}
- name: scripts
configMap:
name: {{ include "aztec-network.fullname" . }}-scripts
initContainers:
{{- include "aztec-network.serviceAddressSetupContainer" . | nindent 8 }}
# NOTE(review): despite its name, this init container never contacts the prover node. It sources
# the generated service addresses and, when telemetry is enabled, blocks until the OpenTelemetry
# collector answers; with telemetry disabled it exits immediately. Consider renaming it.
- name: wait-for-prover-node
image: {{ .Values.images.aztec.image }}
command:
- /bin/bash
- -c
- |
source /shared/config/service-addresses
{{- if .Values.telemetry.enabled }}
until curl --head --silent {{ include "aztec-network.otelCollectorMetricsEndpoint" . }} > /dev/null; do
echo "Waiting for OpenTelemetry collector..."
sleep 5
done
echo "OpenTelemetry collector is ready!"
{{- end }}
volumeMounts:
- name: config
mountPath: /shared/config
containers:
- name: prover-broker
image: "{{ .Values.images.aztec.image }}"
imagePullPolicy: {{ .Values.images.aztec.pullPolicy }}
volumeMounts:
- name: config
mountPath: /shared/config
# Source the generated service addresses, then start the aztec binary in prover-broker mode.
command:
- "/bin/bash"
- "-c"
- |
source /shared/config/service-addresses && \
node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js start --prover-broker
env:
# The broker listens on the same port the Service below exposes.
- name: AZTEC_PORT
value: "{{ .Values.proverBroker.service.nodePort }}"
- name: LOG_LEVEL
value: "{{ .Values.proverBroker.logLevel }}"
- name: LOG_JSON
value: "1"
- name: DEBUG
value: "{{ .Values.proverBroker.debug }}"
# Broker tuning, taken one-to-one from the proverBroker values.
- name: PROVER_BROKER_POLL_INTERVAL_MS
value: "{{ .Values.proverBroker.pollIntervalMs }}"
- name: PROVER_BROKER_JOB_TIMEOUT_MS
value: "{{ .Values.proverBroker.jobTimeoutMs }}"
- name: PROVER_BROKER_JOB_MAX_RETRIES
value: "{{ .Values.proverBroker.jobMaxRetries }}"
- name: PROVER_BROKER_DATA_DIRECTORY
value: "{{ .Values.proverBroker.dataDirectory }}"
- name: OTEL_RESOURCE_ATTRIBUTES
value: service.name={{ .Release.Name }},service.namespace={{ .Release.Namespace }},service.version={{ .Chart.AppVersion }},environment={{ .Values.environment | default "production" }}
- name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
value: {{ include "aztec-network.otelCollectorMetricsEndpoint" . | quote }}
- name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
value: {{ include "aztec-network.otelCollectorTracesEndpoint" . | quote }}
- name: OTEL_EXPORTER_OTLP_LOGS_ENDPOINT
value: {{ include "aztec-network.otelCollectorLogsEndpoint" . | quote }}
# NOTE(review): nothing in this template reads a nodeSelector value; confirm whether the broker
# needs scheduling constraints like the other prover workloads.
resources:
{{- toYaml .Values.proverBroker.resources | nindent 12 }}
---
# ClusterIP Service selecting the broker pods (app: prover-broker) and exposing the broker port
# under the port name `node`.
apiVersion: v1
kind: Service
metadata:
name: {{ include "aztec-network.fullname" . }}-prover-broker
labels:
{{- include "aztec-network.labels" . | nindent 4 }}
spec:
type: ClusterIP
selector:
{{- include "aztec-network.selectorLabels" . | nindent 4 }}
app: prover-broker
ports:
- port: {{ .Values.proverBroker.service.nodePort }}
name: node
{{ end }}
36 changes: 31 additions & 5 deletions spartan/aztec-network/templates/prover-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,17 @@ spec:
sleep 5
done
echo "Ethereum node is ready!"
# PROVER_BROKER_ENABLED refers to the broker built into this prover node. When it is "false"
# the node relies on the external broker, so block until PROVER_BROKER_HOST answers on /status.
if [ "${PROVER_BROKER_ENABLED}" == "false" ]; then
until curl -s -X POST ${PROVER_BROKER_HOST}/status; do
echo "Waiting for broker ${PROVER_BROKER_HOST} ..."
sleep 5
done
echo "Broker is ready!"
else
# Any other value skips the wait entirely.
echo "Using built-in job broker"
fi
{{- if .Values.telemetry.enabled }}
until curl --head --silent {{ include "aztec-network.otelCollectorMetricsEndpoint" . }} > /dev/null; do
echo "Waiting for OpenTelemetry collector..."
Expand All @@ -54,6 +65,10 @@ spec:
volumeMounts:
- name: config
mountPath: /shared/config
env:
- name: PROVER_BROKER_ENABLED
value: "{{ .Values.proverNode.proverBroker.enabled }}"

- name: configure-prover-env
image: "{{ .Values.images.aztec.image }}"
imagePullPolicy: {{ .Values.images.aztec.pullPolicy }}
Expand Down Expand Up @@ -107,15 +122,26 @@ spec:
value: "{{ .Values.proverNode.debug }}"
- name: PROVER_REAL_PROOFS
value: "{{ .Values.proverNode.realProofs }}"
- name: PROVER_AGENT_ENABLED
value: "{{ .Values.proverNode.proverAgentEnabled }}"
- name: PROVER_AGENT_COUNT
value: "{{ .Values.proverNode.proverAgent.count }}"
- name: PROVER_AGENT_POLL_INTERVAL_MS
value: "{{ .Values.proverNode.proverAgent.pollIntervalMs }}"
- name: PROVER_AGENT_PROOF_TYPES
value: {{ join "," .Values.proverNode.proverAgent.proofTypes | quote }}
- name: PROVER_BROKER_ENABLED
value: "{{ .Values.proverNode.proverBroker.enabled }}"
- name: PROVER_BROKER_POLL_INTERVAL_MS
value: "{{ .Values.proverNode.proverBroker.pollIntervalMs }}"
- name: PROVER_BROKER_JOB_TIMEOUT_MS
value: "{{ .Values.proverNode.proverBroker.jobTimeoutMs }}"
- name: PROVER_BROKER_JOB_MAX_RETRIES
value: "{{ .Values.proverNode.proverBroker.jobMaxRetries }}"
- name: PROVER_BROKER_DATA_DIRECTORY
value: "{{ .Values.proverNode.proverBroker.dataDirectory }}"
- name: PROVER_PUBLISHER_PRIVATE_KEY
value: "0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80"
- name: OTEL_RESOURCE_ATTRIBUTES
value: service.name={{ .Release.Name }},service.namespace={{ .Release.Namespace }},service.version={{ .Chart.AppVersion }},environment={{ .Values.environment | default "production" }}
# get private proofs from the boot node
- name: PROVER_JOB_SOURCE_URL
value: "http://$(POD_IP):{{ .Values.proverNode.service.nodePort }}"
- name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
value: {{ include "aztec-network.otelCollectorMetricsEndpoint" . | quote }}
- name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
Expand Down
28 changes: 26 additions & 2 deletions spartan/aztec-network/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,16 @@ proverNode:
logLevel: "debug"
debug: "aztec:*,-aztec:avm_simulator*,-aztec:libp2p_service*,-aztec:circuits:artifact_hash,-json-rpc*,-aztec:world-state:database,-aztec:l2_block_stream*"
realProofs: false
proverAgentEnabled: false
# Passed to the prover node container as PROVER_AGENT_COUNT, PROVER_AGENT_POLL_INTERVAL_MS and
# PROVER_AGENT_PROOF_TYPES (proofTypes is joined with commas).
proverAgent:
count: 0
pollIntervalMs: 1000
proofTypes: []
# Passed to the prover node container as the PROVER_BROKER_* env vars.
# With enabled: false the node's init container waits for the broker at PROVER_BROKER_HOST;
# otherwise it logs "Using built-in job broker" and does not wait.
proverBroker:
enabled: false
jobTimeoutMs: 30000
pollIntervalMs: 1000
jobMaxRetries: 3
dataDirectory: ""
resources:
requests:
memory: "2Gi"
Expand Down Expand Up @@ -206,17 +215,32 @@ proverAgent:
nodePort: 8083
enabled: true
replicas: 1
pollIntervalMs: 1000
# Joined with commas into PROVER_AGENT_PROOF_TYPES for the agent container. Left empty by
# default, matching proverNode.proverAgent.proofTypes, instead of shipping placeholder names.
proofTypes: []
gke:
spotEnabled: false
logLevel: "debug"
debug: "aztec:*,-aztec:avm_simulator*,-aztec:libp2p_service*,-aztec:circuits:artifact_hash,-json-rpc*,-aztec:world-state:database,-aztec:l2_block_stream*"
realProofs: false
concurrency: 1
bb:
hardwareConcurrency: ""
nodeSelector: {}
resources: {}

# Settings for the standalone prover broker (templates/prover-broker.yaml).
proverBroker:
service:
# Used both as the broker's AZTEC_PORT and as the port exposed by its Service.
nodePort: 8084
# Gates rendering of the broker ReplicaSet and Service.
enabled: true
replicas: 1
# The next four values are passed to the broker as PROVER_BROKER_JOB_TIMEOUT_MS,
# PROVER_BROKER_POLL_INTERVAL_MS, PROVER_BROKER_JOB_MAX_RETRIES and PROVER_BROKER_DATA_DIRECTORY.
jobTimeoutMs: 30000
pollIntervalMs: 1000
jobMaxRetries: 3
dataDirectory: ""
logLevel: "debug"
debug: "aztec:*,-aztec:avm_simulator*,-aztec:libp2p_service*,-aztec:circuits:artifact_hash,-json-rpc*,-aztec:world-state:database,-aztec:l2_block_stream*"
# NOTE(review): the broker template does not appear to consume nodeSelector; confirm it is wired up.
nodeSelector: {}
resources: {}

jobs:
deployL1Verifier:
enable: false
1 change: 1 addition & 0 deletions yarn-project/aztec-node/src/aztec-node/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ export class AztecNodeService implements AztecNode {
// now create the merkle trees and the world state synchronizer
const worldStateSynchronizer = await createWorldStateSynchronizer(config, archiver, telemetry);
const proofVerifier = config.realProofs ? await BBCircuitVerifier.new(config) : new TestCircuitVerifier();
log.info(`Aztec node accepting ${config.realProofs ? 'real' : 'test'} proofs`);

// create the tx pool and the p2p client, which will need the l2 block source
const p2pClient = await createP2PClient(config, archiver, proofVerifier, worldStateSynchronizer, telemetry);
Expand Down
Loading

0 comments on commit 6fd5fc1

Please sign in to comment.