Skip to content

Commit 47c1b4c

Browse files
Move PVF code and PoV decompression to PVF host workers (#5142)
Closes #5071 This PR aims to * Move all the blocking decompression from the candidate validation subsystem to the PVF host workers; * Run the candidate validation subsystem on the non-blocking pool again. Upsides: no blocking operations in the subsystem's main loop. PVF throughput is not limited by the ability of the subsystem to decompress a lot of stuff. Correctness and homogeneity improve, as the artifact used to be identified by the hash of decompressed code, and now they are identified by the hash of compressed code, which coincides with the on-chain `ValidationCodeHash`. Downsides: the PVF code decompression is now accounted for in the PVF preparation timeout (be it pre-checking or actual preparation). Taking into account that the decompression duration is on the order of milliseconds, and the preparation timeout is on the order of seconds, I believe it is negligible.
1 parent 380cd21 commit 47c1b4c

File tree

26 files changed

+642
-525
lines changed

26 files changed

+642
-525
lines changed

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

polkadot/node/core/candidate-validation/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ gum = { workspace = true, default-features = true }
1717

1818
sp-keystore = { workspace = true }
1919
sp-application-crypto = { workspace = true }
20-
sp-maybe-compressed-blob = { workspace = true, default-features = true }
2120
codec = { features = ["bit-vec", "derive"], workspace = true }
2221

2322
polkadot-primitives = { workspace = true, default-features = true }
@@ -36,5 +35,6 @@ sp-keyring = { workspace = true, default-features = true }
3635
futures = { features = ["thread-pool"], workspace = true }
3736
assert_matches = { workspace = true }
3837
polkadot-node-subsystem-test-helpers = { workspace = true }
38+
sp-maybe-compressed-blob = { workspace = true, default-features = true }
3939
sp-core = { workspace = true, default-features = true }
4040
polkadot-primitives-test-helpers = { workspace = true }

polkadot/node/core/candidate-validation/src/lib.rs

Lines changed: 47 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,7 @@ use polkadot_node_core_pvf::{
2727
InternalValidationError, InvalidCandidate as WasmInvalidCandidate, PossiblyInvalidError,
2828
PrepareError, PrepareJobKind, PvfPrepData, ValidationError, ValidationHost,
2929
};
30-
use polkadot_node_primitives::{
31-
BlockData, InvalidCandidate, PoV, ValidationResult, POV_BOMB_LIMIT, VALIDATION_CODE_BOMB_LIMIT,
32-
};
30+
use polkadot_node_primitives::{InvalidCandidate, PoV, ValidationResult};
3331
use polkadot_node_subsystem::{
3432
errors::RuntimeApiError,
3533
messages::{
@@ -41,9 +39,7 @@ use polkadot_node_subsystem::{
4139
};
4240
use polkadot_node_subsystem_util as util;
4341
use polkadot_overseer::ActiveLeavesUpdate;
44-
use polkadot_parachain_primitives::primitives::{
45-
ValidationParams, ValidationResult as WasmValidationResult,
46-
};
42+
use polkadot_parachain_primitives::primitives::ValidationResult as WasmValidationResult;
4743
use polkadot_primitives::{
4844
executor_params::{
4945
DEFAULT_APPROVAL_EXECUTION_TIMEOUT, DEFAULT_BACKING_EXECUTION_TIMEOUT,
@@ -504,21 +500,12 @@ where
504500
continue;
505501
};
506502

507-
let pvf = match sp_maybe_compressed_blob::decompress(
508-
&validation_code.0,
509-
VALIDATION_CODE_BOMB_LIMIT,
510-
) {
511-
Ok(code) => PvfPrepData::from_code(
512-
code.into_owned(),
513-
executor_params.clone(),
514-
timeout,
515-
PrepareJobKind::Prechecking,
516-
),
517-
Err(e) => {
518-
gum::debug!(target: LOG_TARGET, err=?e, "cannot decompress validation code");
519-
continue
520-
},
521-
};
503+
let pvf = PvfPrepData::from_code(
504+
validation_code.0,
505+
executor_params.clone(),
506+
timeout,
507+
PrepareJobKind::Prechecking,
508+
);
522509

523510
active_pvfs.push(pvf);
524511
processed_code_hashes.push(code_hash);
@@ -651,21 +638,12 @@ where
651638

652639
let timeout = pvf_prep_timeout(&executor_params, PvfPrepKind::Precheck);
653640

654-
let pvf = match sp_maybe_compressed_blob::decompress(
655-
&validation_code.0,
656-
VALIDATION_CODE_BOMB_LIMIT,
657-
) {
658-
Ok(code) => PvfPrepData::from_code(
659-
code.into_owned(),
660-
executor_params,
661-
timeout,
662-
PrepareJobKind::Prechecking,
663-
),
664-
Err(e) => {
665-
gum::debug!(target: LOG_TARGET, err=?e, "precheck: cannot decompress validation code");
666-
return PreCheckOutcome::Invalid
667-
},
668-
};
641+
let pvf = PvfPrepData::from_code(
642+
validation_code.0,
643+
executor_params,
644+
timeout,
645+
PrepareJobKind::Prechecking,
646+
);
669647

670648
match validation_backend.precheck_pvf(pvf).await {
671649
Ok(_) => PreCheckOutcome::Valid,
@@ -873,49 +851,15 @@ async fn validate_candidate_exhaustive(
873851
return Ok(ValidationResult::Invalid(e))
874852
}
875853

876-
let raw_validation_code = match sp_maybe_compressed_blob::decompress(
877-
&validation_code.0,
878-
VALIDATION_CODE_BOMB_LIMIT,
879-
) {
880-
Ok(code) => code,
881-
Err(e) => {
882-
gum::info!(target: LOG_TARGET, ?para_id, err=?e, "Invalid candidate (validation code)");
883-
884-
// Code already passed pre-checking, if decompression fails now this most likely means
885-
// some local corruption happened.
886-
return Err(ValidationFailed("Code decompression failed".to_string()))
887-
},
888-
};
889-
metrics.observe_code_size(raw_validation_code.len());
890-
891-
metrics.observe_pov_size(pov.block_data.0.len(), true);
892-
let raw_block_data =
893-
match sp_maybe_compressed_blob::decompress(&pov.block_data.0, POV_BOMB_LIMIT) {
894-
Ok(block_data) => BlockData(block_data.to_vec()),
895-
Err(e) => {
896-
gum::info!(target: LOG_TARGET, ?para_id, err=?e, "Invalid candidate (PoV code)");
897-
898-
// If the PoV is invalid, the candidate certainly is.
899-
return Ok(ValidationResult::Invalid(InvalidCandidate::PoVDecompressionFailure))
900-
},
901-
};
902-
metrics.observe_pov_size(raw_block_data.0.len(), false);
903-
904-
let params = ValidationParams {
905-
parent_head: persisted_validation_data.parent_head.clone(),
906-
block_data: raw_block_data,
907-
relay_parent_number: persisted_validation_data.relay_parent_number,
908-
relay_parent_storage_root: persisted_validation_data.relay_parent_storage_root,
909-
};
910-
854+
let persisted_validation_data = Arc::new(persisted_validation_data);
911855
let result = match exec_kind {
912856
// Retry is disabled to reduce the chance of nondeterministic blocks getting backed and
913857
// honest backers getting slashed.
914858
PvfExecKind::Backing => {
915859
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepKind::Prepare);
916860
let exec_timeout = pvf_exec_timeout(&executor_params, exec_kind);
917861
let pvf = PvfPrepData::from_code(
918-
raw_validation_code.to_vec(),
862+
validation_code.0,
919863
executor_params,
920864
prep_timeout,
921865
PrepareJobKind::Compilation,
@@ -925,17 +869,19 @@ async fn validate_candidate_exhaustive(
925869
.validate_candidate(
926870
pvf,
927871
exec_timeout,
928-
params.encode(),
872+
persisted_validation_data.clone(),
873+
pov,
929874
polkadot_node_core_pvf::Priority::Normal,
930875
)
931876
.await
932877
},
933878
PvfExecKind::Approval =>
934879
validation_backend
935880
.validate_candidate_with_retry(
936-
raw_validation_code.to_vec(),
881+
validation_code.0,
937882
pvf_exec_timeout(&executor_params, exec_kind),
938-
params,
883+
persisted_validation_data.clone(),
884+
pov,
939885
executor_params,
940886
PVF_APPROVAL_EXECUTION_RETRY_DELAY,
941887
polkadot_node_core_pvf::Priority::Critical,
@@ -961,6 +907,8 @@ async fn validate_candidate_exhaustive(
961907
Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)),
962908
Err(ValidationError::Invalid(WasmInvalidCandidate::WorkerReportedInvalid(e))) =>
963909
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))),
910+
Err(ValidationError::Invalid(WasmInvalidCandidate::PoVDecompressionFailure)) =>
911+
Ok(ValidationResult::Invalid(InvalidCandidate::PoVDecompressionFailure)),
964912
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)) =>
965913
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(
966914
"ambiguous worker death".to_string(),
@@ -1007,7 +955,7 @@ async fn validate_candidate_exhaustive(
1007955
// invalid.
1008956
Ok(ValidationResult::Invalid(InvalidCandidate::CommitmentsHashMismatch))
1009957
} else {
1010-
Ok(ValidationResult::Valid(outputs, persisted_validation_data))
958+
Ok(ValidationResult::Valid(outputs, (*persisted_validation_data).clone()))
1011959
}
1012960
},
1013961
}
@@ -1020,7 +968,8 @@ trait ValidationBackend {
1020968
&mut self,
1021969
pvf: PvfPrepData,
1022970
exec_timeout: Duration,
1023-
encoded_params: Vec<u8>,
971+
pvd: Arc<PersistedValidationData>,
972+
pov: Arc<PoV>,
1024973
// The priority for the preparation job.
1025974
prepare_priority: polkadot_node_core_pvf::Priority,
1026975
) -> Result<WasmValidationResult, ValidationError>;
@@ -1035,9 +984,10 @@ trait ValidationBackend {
1035984
/// preparation.
1036985
async fn validate_candidate_with_retry(
1037986
&mut self,
1038-
raw_validation_code: Vec<u8>,
987+
code: Vec<u8>,
1039988
exec_timeout: Duration,
1040-
params: ValidationParams,
989+
pvd: Arc<PersistedValidationData>,
990+
pov: Arc<PoV>,
1041991
executor_params: ExecutorParams,
1042992
retry_delay: Duration,
1043993
// The priority for the preparation job.
@@ -1046,7 +996,7 @@ trait ValidationBackend {
1046996
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepKind::Prepare);
1047997
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
1048998
let pvf = PvfPrepData::from_code(
1049-
raw_validation_code,
999+
code,
10501000
executor_params,
10511001
prep_timeout,
10521002
PrepareJobKind::Compilation,
@@ -1057,7 +1007,13 @@ trait ValidationBackend {
10571007

10581008
// Use `Priority::Critical` as finality trumps parachain liveliness.
10591009
let mut validation_result = self
1060-
.validate_candidate(pvf.clone(), exec_timeout, params.encode(), prepare_priority)
1010+
.validate_candidate(
1011+
pvf.clone(),
1012+
exec_timeout,
1013+
pvd.clone(),
1014+
pov.clone(),
1015+
prepare_priority,
1016+
)
10611017
.await;
10621018
if validation_result.is_ok() {
10631019
return validation_result
@@ -1130,10 +1086,14 @@ trait ValidationBackend {
11301086
validation_result
11311087
);
11321088

1133-
// Encode the params again when re-trying. We expect the retry case to be relatively
1134-
// rare, and we want to avoid unconditionally cloning data.
11351089
validation_result = self
1136-
.validate_candidate(pvf.clone(), new_timeout, params.encode(), prepare_priority)
1090+
.validate_candidate(
1091+
pvf.clone(),
1092+
new_timeout,
1093+
pvd.clone(),
1094+
pov.clone(),
1095+
prepare_priority,
1096+
)
11371097
.await;
11381098
}
11391099
}
@@ -1153,13 +1113,13 @@ impl ValidationBackend for ValidationHost {
11531113
&mut self,
11541114
pvf: PvfPrepData,
11551115
exec_timeout: Duration,
1156-
encoded_params: Vec<u8>,
1116+
pvd: Arc<PersistedValidationData>,
1117+
pov: Arc<PoV>,
11571118
// The priority for the preparation job.
11581119
prepare_priority: polkadot_node_core_pvf::Priority,
11591120
) -> Result<WasmValidationResult, ValidationError> {
11601121
let (tx, rx) = oneshot::channel();
1161-
if let Err(err) =
1162-
self.execute_pvf(pvf, exec_timeout, encoded_params, prepare_priority, tx).await
1122+
if let Err(err) = self.execute_pvf(pvf, exec_timeout, pvd, pov, prepare_priority, tx).await
11631123
{
11641124
return Err(InternalValidationError::HostCommunication(format!(
11651125
"cannot send pvf to the validation host, it might have shut down: {:?}",

polkadot/node/core/candidate-validation/src/metrics.rs

Lines changed: 0 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@ pub(crate) struct MetricsInner {
2323
pub(crate) validate_from_chain_state: prometheus::Histogram,
2424
pub(crate) validate_from_exhaustive: prometheus::Histogram,
2525
pub(crate) validate_candidate_exhaustive: prometheus::Histogram,
26-
pub(crate) pov_size: prometheus::HistogramVec,
27-
pub(crate) code_size: prometheus::Histogram,
2826
}
2927

3028
/// Candidate validation metrics.
@@ -70,21 +68,6 @@ impl Metrics {
7068
.as_ref()
7169
.map(|metrics| metrics.validate_candidate_exhaustive.start_timer())
7270
}
73-
74-
pub fn observe_code_size(&self, code_size: usize) {
75-
if let Some(metrics) = &self.0 {
76-
metrics.code_size.observe(code_size as f64);
77-
}
78-
}
79-
80-
pub fn observe_pov_size(&self, pov_size: usize, compressed: bool) {
81-
if let Some(metrics) = &self.0 {
82-
metrics
83-
.pov_size
84-
.with_label_values(&[if compressed { "true" } else { "false" }])
85-
.observe(pov_size as f64);
86-
}
87-
}
8871
}
8972

9073
impl metrics::Metrics for Metrics {
@@ -121,33 +104,6 @@ impl metrics::Metrics for Metrics {
121104
))?,
122105
registry,
123106
)?,
124-
pov_size: prometheus::register(
125-
prometheus::HistogramVec::new(
126-
prometheus::HistogramOpts::new(
127-
"polkadot_parachain_candidate_validation_pov_size",
128-
"The compressed and decompressed size of the proof of validity of a candidate",
129-
)
130-
.buckets(
131-
prometheus::exponential_buckets(16384.0, 2.0, 10)
132-
.expect("arguments are always valid; qed"),
133-
),
134-
&["compressed"],
135-
)?,
136-
registry,
137-
)?,
138-
code_size: prometheus::register(
139-
prometheus::Histogram::with_opts(
140-
prometheus::HistogramOpts::new(
141-
"polkadot_parachain_candidate_validation_code_size",
142-
"The size of the decompressed WASM validation blob used for checking a candidate",
143-
)
144-
.buckets(
145-
prometheus::exponential_buckets(16384.0, 2.0, 10)
146-
.expect("arguments are always valid; qed"),
147-
),
148-
)?,
149-
registry,
150-
)?,
151107
};
152108
Ok(Metrics(Some(metrics)))
153109
}

0 commit comments

Comments
 (0)