Skip to content

Commit 9e0148d

Browse files
authored
safekeeper: use protobuf for sending compressed records to pageserver (#9821)
## Problem

#9746 lifted decoding and interpretation of WAL to the safekeeper. This reduced the ingested amount on the pageservers by around 10x for a tenant with 8 shards, but doubled the ingested amount for single-sharded tenants. Also, #9746 uses bincode, which doesn't support schema evolution. Technically the schema can be evolved, but it's very cumbersome.

## Summary of changes

This patch set addresses both problems by adding protobuf support for the interpreted WAL records and adding compression support. Compressed protobuf reduced the ingested amount by 100x on the 32-shard `test_sharded_ingest` case (compared to non-interpreted proto). For the 1-shard case the reduction is 5x.

The sister change to `rust-postgres` is [here](neondatabase/rust-postgres#33).

## Links

Related: #9336
Epic: #9329
1 parent 7b41ee8 commit 9e0148d

File tree

21 files changed

+702
-106
lines changed

21 files changed

+702
-106
lines changed

Cargo.lock

Lines changed: 10 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

libs/pageserver_api/src/key.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,18 @@ impl Key {
229229
}
230230
}
231231

232+
impl CompactKey {
233+
pub fn raw(&self) -> i128 {
234+
self.0
235+
}
236+
}
237+
238+
impl From<i128> for CompactKey {
239+
fn from(value: i128) -> Self {
240+
Self(value)
241+
}
242+
}
243+
232244
impl fmt::Display for Key {
233245
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
234246
write!(

libs/pq_proto/src/lib.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -688,9 +688,6 @@ pub struct InterpretedWalRecordsBody<'a> {
688688
pub streaming_lsn: u64,
689689
/// Current end of WAL on the server
690690
pub commit_lsn: u64,
691-
/// Start LSN of the next record in PG WAL.
692-
/// Is 0 if the portion of PG WAL did not contain any records.
693-
pub next_record_lsn: u64,
694691
pub data: &'a [u8],
695692
}
696693

@@ -1028,7 +1025,6 @@ impl BeMessage<'_> {
10281025
// dependency
10291026
buf.put_u64(rec.streaming_lsn);
10301027
buf.put_u64(rec.commit_lsn);
1031-
buf.put_u64(rec.next_record_lsn);
10321028
buf.put_slice(rec.data);
10331029
});
10341030
}

libs/utils/src/postgres_client.rs

Lines changed: 24 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7,40 +7,31 @@ use postgres_connection::{parse_host_port, PgConnectionConfig};
77

88
use crate::id::TenantTimelineId;
99

10-
/// Postgres client protocol types
11-
#[derive(
12-
Copy,
13-
Clone,
14-
PartialEq,
15-
Eq,
16-
strum_macros::EnumString,
17-
strum_macros::Display,
18-
serde_with::DeserializeFromStr,
19-
serde_with::SerializeDisplay,
20-
Debug,
21-
)]
22-
#[strum(serialize_all = "kebab-case")]
23-
#[repr(u8)]
10+
#[derive(Copy, Clone, PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
11+
#[serde(rename_all = "kebab-case")]
12+
pub enum InterpretedFormat {
13+
Bincode,
14+
Protobuf,
15+
}
16+
17+
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
18+
#[serde(rename_all = "kebab-case")]
19+
pub enum Compression {
20+
Zstd { level: i8 },
21+
}
22+
23+
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
24+
#[serde(tag = "type", content = "args")]
25+
#[serde(rename_all = "kebab-case")]
2426
pub enum PostgresClientProtocol {
2527
/// Usual Postgres replication protocol
2628
Vanilla,
2729
/// Custom shard-aware protocol that replicates interpreted records.
2830
/// Used to send wal from safekeeper to pageserver.
29-
Interpreted,
30-
}
31-
32-
impl TryFrom<u8> for PostgresClientProtocol {
33-
type Error = u8;
34-
35-
fn try_from(value: u8) -> Result<Self, Self::Error> {
36-
Ok(match value {
37-
v if v == (PostgresClientProtocol::Vanilla as u8) => PostgresClientProtocol::Vanilla,
38-
v if v == (PostgresClientProtocol::Interpreted as u8) => {
39-
PostgresClientProtocol::Interpreted
40-
}
41-
x => return Err(x),
42-
})
43-
}
31+
Interpreted {
32+
format: InterpretedFormat,
33+
compression: Option<Compression>,
34+
},
4435
}
4536

4637
pub struct ConnectionConfigArgs<'a> {
@@ -63,7 +54,10 @@ impl<'a> ConnectionConfigArgs<'a> {
6354
"-c".to_owned(),
6455
format!("timeline_id={}", self.ttid.timeline_id),
6556
format!("tenant_id={}", self.ttid.tenant_id),
66-
format!("protocol={}", self.protocol as u8),
57+
format!(
58+
"protocol={}",
59+
serde_json::to_string(&self.protocol).unwrap()
60+
),
6761
];
6862

6963
if self.shard_number.is_some() {

libs/wal_decoder/Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,19 @@ license.workspace = true
88
testing = ["pageserver_api/testing"]
99

1010
[dependencies]
11+
async-compression.workspace = true
1112
anyhow.workspace = true
1213
bytes.workspace = true
1314
pageserver_api.workspace = true
15+
prost.workspace = true
1416
postgres_ffi.workspace = true
1517
serde.workspace = true
18+
thiserror.workspace = true
19+
tokio = { workspace = true, features = ["io-util"] }
20+
tonic.workspace = true
1621
tracing.workspace = true
1722
utils.workspace = true
1823
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
24+
25+
[build-dependencies]
26+
tonic-build.workspace = true

libs/wal_decoder/build.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
fn main() -> Result<(), Box<dyn std::error::Error>> {
2+
// Generate rust code from .proto protobuf.
3+
//
4+
// Note: we previously tried to use a deterministic location at proto/ to
5+
// make the generated code easy to find, but interference with cachepot
6+
// apparently sometimes failed the build. Anyway, per the cargo docs, a
7+
// build script shouldn't output anywhere but $OUT_DIR.
8+
tonic_build::compile_protos("proto/interpreted_wal.proto")
9+
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
10+
Ok(())
11+
}
libs/wal_decoder/proto/interpreted_wal.proto

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
syntax = "proto3";
2+
3+
package interpreted_wal;
4+
5+
message InterpretedWalRecords {
6+
repeated InterpretedWalRecord records = 1;
7+
optional uint64 next_record_lsn = 2;
8+
}
9+
10+
message InterpretedWalRecord {
11+
optional bytes metadata_record = 1;
12+
SerializedValueBatch batch = 2;
13+
uint64 next_record_lsn = 3;
14+
bool flush_uncommitted = 4;
15+
uint32 xid = 5;
16+
}
17+
18+
message SerializedValueBatch {
19+
bytes raw = 1;
20+
repeated ValueMeta metadata = 2;
21+
uint64 max_lsn = 3;
22+
uint64 len = 4;
23+
}
24+
25+
enum ValueMetaType {
26+
Serialized = 0;
27+
Observed = 1;
28+
}
29+
30+
message ValueMeta {
31+
ValueMetaType type = 1;
32+
CompactKey key = 2;
33+
uint64 lsn = 3;
34+
optional uint64 batch_offset = 4;
35+
optional uint64 len = 5;
36+
optional bool will_init = 6;
37+
}
38+
39+
message CompactKey {
40+
int64 high = 1;
41+
int64 low = 2;
42+
}
43+

libs/wal_decoder/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
pub mod decoder;
22
pub mod models;
33
pub mod serialized_batch;
4+
pub mod wire_format;

libs/wal_decoder/src/models.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,32 @@ use utils::lsn::Lsn;
3737

3838
use crate::serialized_batch::SerializedValueBatch;
3939

40+
// Code generated by protobuf.
41+
pub mod proto {
42+
// Tonic emits derives as `#[derive(Clone, PartialEq, ::prost::Message)]`;
43+
// we don't use these types for anything but interpreted WAL data transmission,
44+
// so it's ok to ignore this one.
45+
#![allow(clippy::derive_partial_eq_without_eq)]
46+
// The generated ValueMeta has a `len` method generated for its `len` field.
47+
#![allow(clippy::len_without_is_empty)]
48+
tonic::include_proto!("interpreted_wal");
49+
}
50+
4051
#[derive(Serialize, Deserialize)]
4152
pub enum FlushUncommittedRecords {
4253
Yes,
4354
No,
4455
}
4556

57+
/// A batch of interpreted WAL records
58+
#[derive(Serialize, Deserialize)]
59+
pub struct InterpretedWalRecords {
60+
pub records: Vec<InterpretedWalRecord>,
61+
// Start LSN of the next record after the batch.
62+
// Note that said record may not belong to the current shard.
63+
pub next_record_lsn: Option<Lsn>,
64+
}
65+
4666
/// An interpreted Postgres WAL record, ready to be handled by the pageserver
4767
#[derive(Serialize, Deserialize)]
4868
pub struct InterpretedWalRecord {

0 commit comments

Comments
 (0)