5 changes: 1 addition & 4 deletions Cargo.lock

Some generated files are not rendered by default.

13 changes: 3 additions & 10 deletions src/catalog-protos/BUILD.bazel
@@ -59,7 +59,7 @@ rust_test(
compile_data = [],
crate = ":mz_catalog_protos",
crate_features = ["default"],
data = [] + glob(["protos/**"]),
data = [],
env = {},
lint_config = ":lints",
proc_macro_deps = [] + all_crate_deps(
@@ -71,7 +71,6 @@ rust_test(
version = "0.0.0",
deps = [
"//src/audit-log:mz_audit_log",
"//src/build-tools:mz_build_tools",
"//src/compute-types:mz_compute_types",
"//src/controller-types:mz_controller_types",
"//src/ore:mz_ore",
@@ -90,7 +89,6 @@ rust_doc_test(
crate = ":mz_catalog_protos",
deps = [
"//src/audit-log:mz_audit_log",
"//src/build-tools:mz_build_tools",
"//src/compute-types:mz_compute_types",
"//src/controller-types:mz_controller_types",
"//src/ore:mz_ore",
@@ -109,16 +107,11 @@ cargo_build_script(
srcs = ["build.rs"],
build_script_env = {},
compile_data = [],
data = [] + glob(["protos/**"]),
data = [],
proc_macro_deps = [] + all_crate_deps(build_proc_macro = True),
rustc_env = {},
rustc_flags = [],
deps = ["//src/build-tools:mz_build_tools"] + all_crate_deps(build = True),
)

filegroup(
name = "all_protos",
srcs = glob(["protos/**"]),
deps = [] + all_crate_deps(build = True),
)

extract_cargo_lints(
27 changes: 2 additions & 25 deletions src/catalog-protos/Cargo.toml
@@ -16,47 +16,24 @@ mz-proto = { path = "../proto" }
mz-repr = { path = "../repr" }
mz-sql = { path = "../sql" }
mz-storage-types = { path = "../storage-types" }
paste = "1.0.11"
num_enum = "0.7.4"
proptest = { version = "1.7.0", default-features = false, features = ["std"] }
proptest-derive = { version = "0.5.1", features = ["boxed_union"] }
prost = "0.13.5"
proptest-derive = "0.5.1"
serde = { version = "1.0.219", features = ["derive"] }
workspace-hack = { version = "0.0.0", path = "../workspace-hack", optional = true }

[dev-dependencies]
mz-build-tools = { path = "../build-tools", default-features = false }
mz-ore = { path = "../ore", features = ["test"] }
mz-proto = { path = "../proto" }
proptest = { version = "1.7.0", default-features = false, features = ["std"] }
similar-asserts = "1.7"

[build-dependencies]
anyhow = "1.0.100"
md-5 = "0.10.6"
mz-build-tools = { path = "../build-tools", default-features = false, features = ["protobuf-src"] }
prost-build = "0.13.5"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.145"

[package.metadata.cargo-udeps.ignore]
normal = ["workspace-hack"]

[package.metadata.cargo-gazelle]
# TODO(parkmycar): Get rid of this hack by introducing a `catalog-types` crate.
# This exists because there is a test in the `catalog` crate (`test_proto_serialization_stability`)
# that needs both the protobuf definitions, but also some catalog types.
additive_content = """
filegroup(
name = "all_protos",
srcs = glob(["protos/**"]),
)
"""

[package.metadata.cargo-gazelle.build]
data = ["protos/**"]

[package.metadata.cargo-gazelle.test.lib]
data = ["protos/**"]

[features]
default = ["workspace-hack"]
137 changes: 28 additions & 109 deletions src/catalog-protos/build.rs
@@ -9,47 +9,46 @@

use std::collections::BTreeMap;
use std::fs;
use std::io::{BufReader, Write};
use std::io::Write;
use std::path::PathBuf;

use anyhow::Context;
use md5::{Digest, Md5};
use serde::{Deserialize, Serialize};

/// The path of a protobuf file and its [`md5`] hash.
/// The path of an object definition file and its [`md5`] hash.
///
/// We store a hash of all the files to make sure they don't accidentally change, which would
/// invalidate our snapshotted types, and could silently introduce bugs.
#[derive(Debug, Clone, Deserialize, Serialize)]
struct ProtoHash {
struct ObjectsHash {
name: String,
md5: String,
}

const PROTO_DIRECTORY: &str = "protos";
const PROTO_HASHES: &str = "protos/hashes.json";
const OBJECTS_HASHES: &str = "objects_hashes.json";

fn main() -> anyhow::Result<()> {
println!("cargo:rerun-if-changed={PROTO_DIRECTORY}");
let crate_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

// Read in the persisted hashes from disk.
let hashes = fs::File::open(PROTO_HASHES).context("opening proto hashes")?;
let reader = BufReader::new(&hashes);
let hashes: Vec<ProtoHash> = serde_json::from_reader(reader)?;
let hashes_path = crate_root.join(OBJECTS_HASHES);
let hashes_json = fs::read_to_string(&hashes_path)?;
let hashes: Vec<ObjectsHash> = serde_json::from_str(&hashes_json)?;
let mut persisted: BTreeMap<String, String> =
hashes.into_iter().map(|e| (e.name, e.md5)).collect();

// Discover all of the protobuf files on disk.
let protos: BTreeMap<String, String> = fs::read_dir(PROTO_DIRECTORY)?
// Discover all of the object definition files on disk.
let src_dir = crate_root.join("src");
let objects: BTreeMap<String, String> = fs::read_dir(src_dir)?
// If we fail to read one file, fail everything.
.collect::<Result<Vec<_>, _>>()?
.into_iter()
// Filter to only files with the .proto extension.
// Filter to only files of the form `objects*.rs`.
.filter(|entry| {
entry
.path()
.extension()
.map(|e| e.to_string_lossy().contains("proto"))
.unwrap_or(false)
let name = entry.file_name();
let s = name.to_string_lossy();
s.starts_with("objects") && s.ends_with(".rs")
})
.map(|file| {
let path = file.path();
@@ -71,34 +70,34 @@ fn main() -> anyhow::Result<()> {
})
.collect();

// After validating our hashes we'll re-write the file if any new protos
// After validating our hashes we'll re-write the file if any new object definitions
// have been added.
let mut to_persist: Vec<ProtoHash> = Vec::new();
let mut to_persist: Vec<ObjectsHash> = Vec::new();
let mut any_new = false;

// Check the persisted hashes against what we just read in from disk.
for (name, hash) in protos {
for (name, hash) in objects {
match persisted.remove(&name) {
// Hashes have changed!
Some(og_hash) if hash != og_hash => {
anyhow::bail!(error_message(og_hash, hash, name));
}
// Found a proto file on disk that we didn't have persisted, we'll just persist it.
// Found an objects file on disk that we didn't have persisted, we'll just persist it.
None => {
to_persist.push(ProtoHash { name, md5: hash });
to_persist.push(ObjectsHash { name, md5: hash });
any_new = true;
}
// We match!
Some(_) => to_persist.push(ProtoHash { name, md5: hash }),
Some(_) => to_persist.push(ObjectsHash { name, md5: hash }),
}
}

// Check if there are any proto files we should have had hashes for, but didn't exist.
// Check whether we have persisted hashes for any objects files that no longer exist on disk.
if !persisted.is_empty() {
anyhow::bail!("Have persisted hashes, but no files on disk? {persisted:#?}");
}

// Write the hashes back out to disk if and only if there are new protos. We
// Write the hashes back out to disk if and only if there are new object definitions. We
// don't do this unconditionally or we'll get stuck in a rebuild loop:
// executing this build script will change the mtime on the hashes file,
// which will force the next compile to rebuild the crate, even if nothing
@@ -107,103 +106,23 @@ fn main() -> anyhow::Result<()> {
let mut file = fs::File::options()
.write(true)
.truncate(true)
.open(PROTO_HASHES)
.open(hashes_path)
.context("opening hashes file to write")?;
serde_json::to_writer_pretty(&mut file, &to_persist).context("persisting hashes")?;
write!(&mut file, "\n").context("writing newline")?;
}

// Generate protos!
let paths: Vec<_> = to_persist
.iter()
.map(|entry| format!("protos/{}", entry.name))
.collect();

const ATTR: &str = "#[derive(Eq, PartialOrd, Ord, ::serde::Serialize, ::serde::Deserialize)]";
const ARBITRARY_ATTR: &str = "#[derive(::proptest_derive::Arbitrary)]";

// 'as' is okay here because we're using it to define the type of the empty slice, which is
// necessary since the method takes the slice as a generic arg.
#[allow(clippy::as_conversions)]
// DO NOT change how JSON serialization works for these objects. The catalog relies on the JSON
// serialization of these objects remaining stable for a specific objects_vX version. If you
// want to change the JSON serialization format then follow these steps:
//
// 1. Create a new version of the `objects.proto` file.
// 2. Update the path of .proto files given to this compile block so that it is only the
// previous .proto files.
// 3. Add a new `prost_build::Config::new()...compile_protos(...)` block that only compiles
// the new and all future .proto files with the changed JSON serialization.
//
// Once we delete all the `.proto` that use the old JSON serialization, then we can delete
// the compile block for them as well.
prost_build::Config::new()
.protoc_executable(mz_build_tools::protoc())
.btree_map(["."])
.bytes(["."])
.message_attribute(".", ATTR)
// Note(parkmycar): This is annoying, but we need to manually specify each oneof so we can
// get them to implement Eq, PartialEq, and Ord. If you define a new oneof you should add
// it here.
.enum_attribute("CatalogItem.value", ATTR)
.enum_attribute("ClusterConfig.variant", ATTR)
.enum_attribute("GlobalId.value", ATTR)
.enum_attribute("CatalogItemId.value", ATTR)
.enum_attribute("ClusterId.value", ATTR)
.enum_attribute("DatabaseId.value", ATTR)
.enum_attribute("SchemaId.value", ATTR)
.enum_attribute("ReplicaId.value", ATTR)
.enum_attribute("RoleId.value", ATTR)
.enum_attribute("NetworkPolicyId.value", ATTR)
.enum_attribute("NetworkPolicyRule.action", ATTR)
.enum_attribute("NetworkPolicyRule.direction", ATTR)
.enum_attribute("ReplicaConfig.location", ATTR)
.enum_attribute("AuditLogEventV1.details", ATTR)
.enum_attribute("AuditLogKey.event", ATTR)
.enum_attribute("StorageUsageKey.usage", ATTR)
.enum_attribute("ResolvedDatabaseSpecifier.value", ATTR)
.enum_attribute("CommentKey.object", ATTR)
.enum_attribute("CommentKey.sub_component", ATTR)
.enum_attribute("ResolvedDatabaseSpecifier.spec", ATTR)
.enum_attribute("SchemaSpecifier.spec", ATTR)
.enum_attribute("RoleVars.Entry.val", ATTR)
.enum_attribute("StateUpdateKind.kind", ATTR)
.enum_attribute("ClusterScheduleOptionValue.value", ATTR)
.enum_attribute("ClusterSchedule.value", ATTR)
.enum_attribute("CreateOrDropClusterReplicaReasonV1.reason", ATTR)
.enum_attribute("RefreshDecisionWithReasonV1.decision", ATTR)
.enum_attribute("RefreshDecisionWithReasonV2.decision", ATTR)
// Serialize/deserialize the top-level enum in the persist-backed
// catalog as "internally tagged"[^1] to set up persist pushdown
// statistics for success.
//
// [^1]: https://serde.rs/enum-representations.html#internally-tagged
.enum_attribute("StateUpdateKind.kind", "#[serde(tag = \"kind\")]")
// We derive Arbitrary for all protobuf types for wire compatibility testing.
.message_attribute(".", ARBITRARY_ATTR)
.enum_attribute(".", ARBITRARY_ATTR)
.compile_protos(
&paths,
&[ /*
This is purposefully empty, and we should never
add any includes because we don't want to allow
our protos to have dependencies. This allows us
to ensure our snapshots can't silently change.
*/
] as &[&str],
)?;

Ok(())
}

/// A (hopefully) helpful error message that describes what to do when the hashes differ.
fn error_message(og_hash: String, hash: String, filename: String) -> String {
let title = "Hashes changed for the persisted protobuf files!";
let title = "Hashes changed for the persisted object definition files!";
let body1 = format!(
"If you changed '{filename}' without first making a snapshot, then you need to copy '{filename}' and rename it with a suffix like '_vX.proto'."
"If you changed '{filename}' without first making a snapshot, then you need to copy '{filename}' and rename it with a suffix like '_vX.rs'."
);
let body2 = format!(
"Otherwise you can update the hash for '{filename}' in '{PROTO_HASHES}' to be '{hash}'."
"Otherwise you can update the hash for '{filename}' in '{OBJECTS_HASHES}' to be '{hash}'."
);
let hashes = format!("persisted_hash({og_hash}) != current_hash({hash})\nFile: {filename}");

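The hunk elided in this file's diff (inside the `.map(|file| { ... })` closure, right after `let path = file.path();`) is where each file's digest is computed. As a rough sketch only — the actual closure body is not shown here — the hashing step with the `md-5` crate already listed in `[build-dependencies]` would look roughly like the following; the helper name is made up for illustration:

use std::fs;
use std::path::Path;

use md5::{Digest, Md5};

// Hypothetical helper mirroring the hashing the build script performs on each
// `objects*.rs` file; the real implementation in `build.rs` is elided above
// and may differ in detail.
fn file_md5(path: &Path) -> anyhow::Result<String> {
    let contents = fs::read(path)?;
    let mut hasher = Md5::new();
    hasher.update(&contents);
    let digest = hasher.finalize();
    // Render the 16-byte digest as a lowercase hex string, the format used in
    // `objects_hashes.json`.
    Ok(digest.iter().map(|b| format!("{b:02x}")).collect())
}

Run over `src/objects_v77.rs`, for example, this should reproduce the `md5` value recorded for that file in `objects_hashes.json` below.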
50 changes: 50 additions & 0 deletions src/catalog-protos/objects_hashes.json
@@ -0,0 +1,50 @@
[
{
"name": "objects.rs",
"md5": "29f67ac98b87677b1614e903fdcdcc3c"
},
{
"name": "objects_v67.rs",
"md5": "f48b00a2411914ae34c9302239b05f45"
},
{
"name": "objects_v68.rs",
"md5": "3c8dc82e6b72b779421fbc149bb32ff1"
},
{
"name": "objects_v69.rs",
"md5": "5cb8a1457a03469a775fe69ea6c73f98"
},
{
"name": "objects_v70.rs",
"md5": "b309df83ddd6674155f6e775ff4482ae"
},
{
"name": "objects_v71.rs",
"md5": "b309df83ddd6674155f6e775ff4482ae"
},
{
"name": "objects_v72.rs",
"md5": "b309df83ddd6674155f6e775ff4482ae"
},
{
"name": "objects_v73.rs",
"md5": "f6419eac6283d905f60d1d8ef6d3f9c1"
},
{
"name": "objects_v74.rs",
"md5": "2b7edb2f05ab659498c9012f68455d4d"
},
{
"name": "objects_v75.rs",
"md5": "1f19ff5f0926ca340f28727b93c4fe98"
},
{
"name": "objects_v76.rs",
"md5": "29f67ac98b87677b1614e903fdcdcc3c"
},
{
"name": "objects_v77.rs",
"md5": "29f67ac98b87677b1614e903fdcdcc3c"
}
]
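Each entry above is one `ObjectsHash` record from `build.rs`, and the whole file is the `Vec<ObjectsHash>` the build script writes back with `serde_json::to_writer_pretty`. A minimal, self-contained sketch of producing an entry in the same shape — the file name and hash below are invented purely for illustration:

use serde::{Deserialize, Serialize};

// Mirrors the `ObjectsHash` struct defined in `build.rs` above.
#[derive(Debug, Clone, Deserialize, Serialize)]
struct ObjectsHash {
    name: String,
    md5: String,
}

fn main() -> anyhow::Result<()> {
    // Illustrative entry only; this md5 value is not a real digest of any file.
    let entries = vec![ObjectsHash {
        name: "objects_vXX.rs".to_string(),
        md5: "00000000000000000000000000000000".to_string(),
    }];
    // Same serialization call the build script uses for the manifest.
    println!("{}", serde_json::to_string_pretty(&entries)?);
    Ok(())
}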