Skip to content

Commit 18b9463

Browse files
authored
[turbopack] add a new hasher implementation to eliminate allocations (vercel#89059)
We only store hashes for TaskTypes in the TaskCache keyspace, so this introduces an allocation-free path to compute them. For writing out the TaskCache this won't make much of a difference, since we were already reusing a scratch buffer, but it should eliminate a source of allocations on the read path. It does, of course, cause a small binary-size regression: `157M (160,952K) -> 158M (161,800K) = +1M (+848K)`.
1 parent 32d8a9b commit 18b9463

File tree

7 files changed

+94
-41
lines changed

7 files changed

+94
-41
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

turbopack/crates/turbo-bincode/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,5 @@ mime = { workspace = true }
1818
ringmap = { workspace = true }
1919
serde = { workspace = true }
2020
smallvec = { workspace = true }
21+
turbo-tasks-hash = { workspace = true }
2122
unty = { workspace = true }

turbopack/crates/turbo-bincode/src/lib.rs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,16 @@ use bincode::{
1111
enc::{Encoder, EncoderImpl, write::Writer},
1212
error::{DecodeError, EncodeError},
1313
};
14+
use turbo_tasks_hash::DeterministicHasher;
1415

1516
pub const TURBO_BINCODE_CONFIG: bincode::config::Configuration = bincode::config::standard();
17+
/// Same as the standard config, but with fixed-width integer encoding.
///
/// The output of this config is only ever fed into a hasher — it is never
/// stored or decoded — so varint size optimizations would buy nothing and
/// only add per-integer branching; fixed-int encoding is both simpler and
/// faster for the hash-only path.
pub const TURBO_BINCODE_HASH_CONFIG: bincode::config::Configuration<
bincode::config::LittleEndian,
bincode::config::Fixint,
bincode::config::NoLimit,
> = TURBO_BINCODE_CONFIG.with_fixed_int_encoding();
1624
pub type TurboBincodeBuffer = SmallVec<[u8; 16]>;
1725
pub type TurboBincodeEncoder<'a> =
1826
EncoderImpl<TurboBincodeWriter<'a>, bincode::config::Configuration>;
@@ -121,6 +129,44 @@ impl Reader for TurboBincodeReader<'_> {
121129
}
122130
}
123131

132+
/// A [`Writer`] that sinks bytes directly into a [`DeterministicHasher`] instead of a buffer.
///
/// This allows encoding values directly to a hash without intermediate buffer allocation.
pub struct HashWriter<'a, H: DeterministicHasher + ?Sized> {
    // Destination hasher; every byte produced by the encoder is forwarded to it
    // via the `Writer` impl below.
    hasher: &'a mut H,
}
138+
139+
impl<'a, H: DeterministicHasher + ?Sized> HashWriter<'a, H> {
140+
/// Creates a new `HashWriter` that writes to the given hasher.
141+
pub fn new(hasher: &'a mut H) -> Self {
142+
Self { hasher }
143+
}
144+
}
145+
146+
impl<H: DeterministicHasher + ?Sized> Writer for HashWriter<'_, H> {
    // Forward every encoded byte slice into the hasher. Hashing cannot fail,
    // so this writer is infallible and always returns `Ok(())`.
    fn write(&mut self, bytes: &[u8]) -> Result<(), EncodeError> {
        self.hasher.write_bytes(bytes);
        Ok(())
    }
}
152+
153+
/// An encoder that writes directly to a [`DeterministicHasher`].
///
/// NOTE(review): the config type parameters spelled out here (little-endian,
/// fixed-int, no limit) must stay in sync with `TURBO_BINCODE_HASH_CONFIG`,
/// since `new_hash_encoder` pairs this type with that config value — confirm
/// both when changing either.
pub type HashEncoder<'a, H> = EncoderImpl<
HashWriter<'a, H>,
bincode::config::Configuration<
bincode::config::LittleEndian,
bincode::config::Fixint,
bincode::config::NoLimit,
>,
>;
162+
163+
/// Creates a new [`HashEncoder`] that encodes directly to the given hasher.
///
/// This is useful for computing hashes of encoded values without allocating a buffer.
/// Uses `TURBO_BINCODE_HASH_CONFIG` (fixed-int encoding), since hashed bytes are
/// never stored and gain nothing from varint compression.
pub fn new_hash_encoder<H: DeterministicHasher + ?Sized>(hasher: &mut H) -> HashEncoder<'_, H> {
    EncoderImpl::new(HashWriter::new(hasher), TURBO_BINCODE_HASH_CONFIG)
}
169+
124170
/// Represents a type that can only be encoded with a [`TurboBincodeEncoder`].
125171
///
126172
/// All traits implementing this must also implement the more generic [`Encode`] trait, but they

turbopack/crates/turbo-tasks-backend/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ turbo-bincode = { workspace = true }
6161
turbo-persistence = { workspace = true }
6262
turbo-rcstr = { workspace = true }
6363
turbo-tasks = { workspace = true }
64+
turbo-tasks-hash = { workspace = true }
6465

6566
[dev-dependencies]
6667
criterion = { workspace = true, features = ["async_tokio"] }

turbopack/crates/turbo-tasks-backend/src/kv_backing_storage.rs

Lines changed: 13 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ use anyhow::{Context, Result};
99
use smallvec::SmallVec;
1010
use turbo_bincode::{
1111
TurboBincodeBuffer, new_turbo_bincode_decoder, turbo_bincode_decode, turbo_bincode_encode,
12-
turbo_bincode_encode_into,
1312
};
1413
use turbo_tasks::{
1514
TaskId,
1615
backend::CachedTaskType,
1716
panic_hooks::{PanicHookGuard, register_panic_hook},
1817
parallel,
1918
};
19+
use turbo_tasks_hash::Xxh3Hash64Hasher;
2020

2121
use crate::{
2222
GitVersionInfo,
@@ -269,11 +269,6 @@ impl<T: KeyValueDatabase + Send + Sync + 'static> BackingStorageSealed
269269
{
270270
let _span = tracing::info_span!("save snapshot", operations = operations.len()).entered();
271271
let mut batch = self.inner.database.write_batch()?;
272-
273-
// these buffers should be large, because they're temporary and re-used.
274-
// From measuring a large application the largest TaskType was ~365b, so this should be big
275-
// enough to trigger no resizes in the loop.
276-
const INITIAL_ENCODE_BUFFER_CAPACITY: usize = 512;
277272
// Start organizing the updates in parallel
278273
match &mut batch {
279274
&mut WriteBatch::Concurrent(ref batch, _) => {
@@ -308,14 +303,8 @@ impl<T: KeyValueDatabase + Send + Sync + 'static> BackingStorageSealed
308303
|updates| {
309304
let _span = _span.clone().entered();
310305
let mut max_task_id = 0;
311-
312-
// Re-use the same buffer across every `compute_task_type_hash` call in
313-
// this chunk. `ConcurrentWriteBatch::put` will copy the data out of
314-
// this buffer into smaller exact-sized vecs.
315-
let mut task_type_bytes =
316-
TurboBincodeBuffer::with_capacity(INITIAL_ENCODE_BUFFER_CAPACITY);
317306
for (task_type, task_id) in updates {
318-
let hash = compute_task_type_hash(&task_type, &mut task_type_bytes);
307+
let hash = compute_task_type_hash(&task_type);
319308
let task_id: u32 = *task_id;
320309

321310
batch
@@ -385,13 +374,8 @@ impl<T: KeyValueDatabase + Send + Sync + 'static> BackingStorageSealed
385374
items = task_cache_updates.iter().map(|m| m.len()).sum::<usize>()
386375
)
387376
.entered();
388-
// Re-use the same buffer across every `serialize_task_type` call.
389-
// `ConcurrentWriteBatch::put` will copy the data out of this buffer into
390-
// smaller exact-sized vecs.
391-
let mut task_type_bytes =
392-
TurboBincodeBuffer::with_capacity(INITIAL_ENCODE_BUFFER_CAPACITY);
393377
for (task_type, task_id) in task_cache_updates.into_iter().flatten() {
394-
let hash = compute_task_type_hash(&task_type, &mut task_type_bytes);
378+
let hash = compute_task_type_hash(&task_type);
395379
let task_id = *task_id;
396380

397381
batch
@@ -437,7 +421,7 @@ impl<T: KeyValueDatabase + Send + Sync + 'static> BackingStorageSealed
437421
tx: &D::ReadTransaction<'_>,
438422
task_type: &CachedTaskType,
439423
) -> Result<SmallVec<[TaskId; 1]>> {
440-
let hash = compute_task_type_hash(task_type, &mut TurboBincodeBuffer::new());
424+
let hash = compute_task_type_hash(task_type);
441425
let buffers = database.get_multiple(tx, KeySpace::TaskCache, &hash.to_le_bytes())?;
442426

443427
let mut task_ids = SmallVec::with_capacity(buffers.len());
@@ -590,28 +574,18 @@ where
590574

591575
/// Computes a deterministic 64-bit hash of a CachedTaskType for use as a TaskCache key.
592576
///
593-
/// This uses the existing TurboBincodeEncode implementation which is deterministic
594-
/// (function IDs from registry, bincode argument encoding), then hashes the result
595-
/// with XxHash64.
596-
fn compute_task_type_hash(
597-
task_type: &CachedTaskType,
598-
scratch_buffer: &mut TurboBincodeBuffer,
599-
) -> u64 {
600-
// TODO: use a custom encoder that can directly hash without filling a buffer
601-
// This should not fail for valid task types - the encoding is deterministic
602-
turbo_bincode_encode_into(task_type, scratch_buffer)
603-
.expect("CachedTaskType encoding should not fail");
604-
let hash = turbo_persistence::hash_key(&scratch_buffer.as_slice());
605-
scratch_buffer.clear();
606-
577+
/// This encodes the task type directly to a hasher, avoiding intermediate buffer allocation.
578+
/// The encoding is deterministic (function IDs from registry, bincode argument encoding).
579+
fn compute_task_type_hash(task_type: &CachedTaskType) -> u64 {
580+
let mut hasher = Xxh3Hash64Hasher::new();
581+
task_type.hash_encode(&mut hasher);
582+
let hash = hasher.finish();
607583
if cfg!(feature = "verify_serialization") {
608-
turbo_bincode_encode_into(task_type, scratch_buffer)
609-
.expect("CachedTaskType encoding should not fail");
610-
let hash2 = turbo_persistence::hash_key(&scratch_buffer.as_slice());
611-
scratch_buffer.clear();
584+
task_type.hash_encode(&mut hasher);
585+
let hash2 = hasher.finish();
612586
assert_eq!(
613587
hash, hash2,
614-
"Encoding TaskType twice was non-deterministic: \n{:?}\ngot hashes {} != {}",
588+
"Hashing TaskType twice was non-deterministic: \n{:?}\ngot hashes {} != {}",
615589
task_type, hash, hash2
616590
);
617591
}

turbopack/crates/turbo-tasks/src/backend.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ use smallvec::SmallVec;
2222
use tracing::Span;
2323
use turbo_bincode::{
2424
TurboBincodeDecode, TurboBincodeDecoder, TurboBincodeEncode, TurboBincodeEncoder,
25-
impl_decode_for_turbo_bincode_decode, impl_encode_for_turbo_bincode_encode,
25+
impl_decode_for_turbo_bincode_decode, impl_encode_for_turbo_bincode_encode, new_hash_encoder,
2626
};
2727
use turbo_rcstr::RcStr;
28+
use turbo_tasks_hash::DeterministicHasher;
2829

2930
use crate::{
3031
RawVc, ReadCellOptions, ReadOutputOptions, ReadRef, SharedReference, TaskId, TaskIdSet,
@@ -81,6 +82,20 @@ impl CachedTaskType {
8182
pub fn get_name(&self) -> &'static str {
8283
self.native_fn.name
8384
}
85+
86+
/// Encodes this task type directly to a hasher, avoiding buffer allocation.
///
/// This uses the same encoding logic as [`TurboBincodeEncode`] but writes
/// directly to a [`DeterministicHasher`] instead of a buffer.
///
/// NOTE(review): the encoding order here — function id, then `this`, then the
/// argument via `arg_meta.hash_encode` — must stay byte-identical to the
/// buffered `TurboBincodeEncode` impl for `CachedTaskType`, or persisted
/// TaskCache hashes would stop matching; confirm against that impl when
/// changing either side.
pub fn hash_encode<H: DeterministicHasher>(&self, hasher: &mut H) {
    let fn_id = registry::get_function_id(self.native_fn);
    // Scope the encoder so its mutable borrow of `hasher` ends before the
    // argument hash-encode below needs the hasher again.
    {
        let mut encoder = new_hash_encoder(hasher);
        Encode::encode(&fn_id, &mut encoder).expect("fn_id encoding should not fail");
        Encode::encode(&self.this, &mut encoder).expect("this encoding should not fail");
    }
    (self.native_fn.arg_meta.hash_encode)(&*self.arg, hasher);
}
8499
}
85100

86101
impl TurboBincodeEncode for CachedTaskType {

turbopack/crates/turbo-tasks/src/native_function.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ use bincode::{Decode, Encode};
55
use futures::Future;
66
use once_cell::sync::Lazy;
77
use tracing::Span;
8-
use turbo_bincode::{AnyDecodeFn, AnyEncodeFn};
8+
use turbo_bincode::{AnyDecodeFn, AnyEncodeFn, new_hash_encoder};
9+
use turbo_tasks_hash::DeterministicHasher;
910

1011
use crate::{
1112
RawVc, TaskExecutionReason, TaskInput, TaskPersistence, TaskPriority,
@@ -24,9 +25,17 @@ type IsResolvedFunctor = fn(&dyn MagicAny) -> bool;
2425
type FilterOwnedArgsFunctor = for<'a> fn(Box<dyn MagicAny>) -> Box<dyn MagicAny>;
2526
type FilterAndResolveFunctor = ResolveFunctor;
2627

28+
/// Function pointer that encodes a task argument directly to a hasher.
///
/// This allows computing hashes of task arguments without intermediate buffer allocation.
/// Takes the type-erased argument (`&dyn Any`) plus the destination hasher; each
/// `ArgMeta` stores a monomorphized instance that downcasts and encodes its
/// concrete argument type.
pub type AnyHashEncodeFn = fn(&dyn Any, &mut dyn DeterministicHasher);
32+
2733
pub struct ArgMeta {
2834
// TODO: This should be an `Option` with `None` for transient tasks. We can skip some codegen.
2935
pub bincode: (AnyEncodeFn, AnyDecodeFn<Box<dyn MagicAny>>),
36+
/// Encodes the argument directly to a hasher, avoiding buffer allocation.
37+
/// Uses the same encoding logic as bincode but writes to a [`DeterministicHasher`].
38+
pub hash_encode: AnyHashEncodeFn,
3039
is_resolved: IsResolvedFunctor,
3140
resolve: ResolveFunctor,
3241
/// Used for trait methods, filters out unused arguments.
@@ -70,6 +79,11 @@ impl ArgMeta {
7079
Ok(Box::new(val))
7180
},
7281
),
82+
hash_encode: |this, hasher| {
83+
let mut encoder = new_hash_encoder(hasher);
84+
T::encode(any_as_encode::<T>(this), &mut encoder)
85+
.expect("encoding to hasher should not fail");
86+
},
7387
is_resolved: |value| downcast_args_ref::<T>(value).is_resolved(),
7488
resolve: resolve_functor_impl::<T>,
7589
filter_owned,

0 commit comments

Comments
 (0)