diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6d3d3839..edaec267 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file.
 - Use `--console-log-format` (or `CONSOLE_LOG_FORMAT`) to set the format to `plain` (default) or `json`.
 - The operator now defaults to `AES/CTR/NoPadding` for `dfs.encrypt.data.transfer.cipher.suite` to improve security and performance ([#693]).
 - The built-in Prometheus servlet is now enabled and metrics are exposed under the `/prom` path of all UI services ([#695]).
+- Added several properties to `hdfs-site.xml` and `core-site.xml` that improve general performance and reliability ([#696]).

 ### Changed

@@ -50,6 +51,7 @@ All notable changes to this project will be documented in this file.
 [#684]: https://github.com/stackabletech/hdfs-operator/pull/684
 [#693]: https://github.com/stackabletech/hdfs-operator/pull/693
 [#695]: https://github.com/stackabletech/hdfs-operator/pull/695
+[#696]: https://github.com/stackabletech/hdfs-operator/pull/696

 ## [25.3.0] - 2025-03-21

diff --git a/deploy/helm/hdfs-operator/templates/roles.yaml b/deploy/helm/hdfs-operator/templates/roles.yaml
index fbc1b8c7..2b846ea5 100644
--- a/deploy/helm/hdfs-operator/templates/roles.yaml
+++ b/deploy/helm/hdfs-operator/templates/roles.yaml
@@ -186,6 +186,8 @@ apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   name: {{ include "operator.name" . }}-clusterrole-nodes
+  labels:
+    {{- include "operator.labels" . | nindent 4 }}
 rules:
 - apiGroups:
   - ""
diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs
index 6394276d..2b388e8c 100644
--- a/rust/operator-binary/src/hdfs_controller.rs
+++ b/rust/operator-binary/src/hdfs_controller.rs
@@ -664,7 +664,40 @@ fn rolegroup_config_map(
         )
         .add("dfs.datanode.registered.hostname", "${env.POD_ADDRESS}")
         .add("dfs.datanode.registered.port", "${env.DATA_PORT}")
-        .add("dfs.datanode.registered.ipc.port", "${env.IPC_PORT}");
+        .add("dfs.datanode.registered.ipc.port", "${env.IPC_PORT}")
+        // The following two properties are set to "true" because there is a small chance
+        // that data written to HDFS is not synced to disk even after a block has been
+        // closed. HBase users can control this explicitly for the WAL, but presumably not
+        // (or not as easily) for flushes and compactions. In theory, HBase should be able
+        // to recover from such failures, but recovery comes at a cost and there is always
+        // a risk. Enabling these settings causes HDFS to sync to disk as soon as possible.
+        .add("dfs.datanode.sync.behind.writes", "true")
+        .add("dfs.datanode.synconclose", "true")
+        // Has defaulted to 10 since at least 2011. This controls the number of NameNode
+        // server threads that handle RPCs from clients (which include the DataNodes).
+        // Ideally we would scale this with the number of DataNodes, but that would
+        // restart the NameNodes every time the DataNode count changes.
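+        // (For reference, a commonly cited heuristic in vendor tuning guides sizes this
+        // value at 20 * ln(number of DataNodes); an assumed cluster of around a dozen
+        // DataNodes would land at roughly 50.)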
+        // A fixed value of 50 should improve performance through higher concurrency.
+        .add("dfs.namenode.handler.count", "50")
+        // Has defaulted to 10 since at least 2012. This controls the number of DataNode
+        // server threads that handle client requests. We don't know how many clients
+        // there may be, so it is hard to pick a good value; 50 should improve performance
+        // through higher concurrency, especially for use-cases like HBase.
+        .add("dfs.datanode.handler.count", "50")
+        // These two have defaulted to 2 and 4 respectively since around 2013. They
+        // control the maximum number of replication "jobs" the NameNode assigns to a
+        // DataNode in a single heartbeat. Increasing them raises network usage during
+        // replication events but can lead to faster recovery.
+        .add("dfs.namenode.replication.max-streams", "4")
+        .add("dfs.namenode.replication.max-streams-hard-limit", "8")
+        // Defaults to 4096 and hasn't changed since at least 2011. These threads perform
+        // the actual data transfer, so the work is IO-bound rather than CPU-heavy, which
+        // is why the count is relatively high to begin with. Today's Java and IO can
+        // handle more, so bump it to 8192 for better performance/concurrency.
+        .add("dfs.datanode.max.transfer.threads", "8192");
     if hdfs.has_https_enabled() {
         hdfs_site.add("dfs.datanode.registered.https.port", "${env.HTTPS_PORT}");
     } else {
@@ -683,7 +716,10 @@ fn rolegroup_config_map(
         .ha_zookeeper_quorum()
         .security_config(hdfs, cluster_info)
         .context(BuildSecurityConfigSnafu)?
-        .enable_prometheus_endpoint();
+        .enable_prometheus_endpoint()
+        // The default (4096) hasn't changed since 2009. Increase it to 128 KiB (131072)
+        // to allow for faster transfers.
+        .add("io.file.buffer.size", "131072");
     if let Some(hdfs_opa_config) = hdfs_opa_config {
         hdfs_opa_config.add_core_site_config(&mut core_site);
     }
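
For reference, these builder calls are rendered into the rolegroup ConfigMaps as standard Hadoop configuration XML. A hypothetical excerpt of the resulting hdfs-site.xml (property names and values are taken from the hunk above; the surrounding structure is the usual Hadoop layout, not output captured from the operator):

<?xml version="1.0"?>
<configuration>
  <property>
    <name>dfs.datanode.synconclose</name>
    <value>true</value>
  </property>
  <property>
    <name>dfs.namenode.handler.count</name>
    <value>50</value>
  </property>
  <property>
    <name>dfs.datanode.max.transfer.threads</name>
    <value>8192</value>
  </property>
</configuration>

The io.file.buffer.size entry ends up in core-site.xml in the same shape.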