Commit 91d6298

feat: Add Example for Apache Beam on Spark on EKS (#852)

2 parents d220d8e + 4767dc7, commit 91d6298

File tree: 10 files changed, +2413 −19 lines
analytics/terraform/spark-k8s-operator/addons.tf

Lines changed: 1 addition & 1 deletion
@@ -959,7 +959,7 @@ resource "random_password" "grafana" {

 #tfsec:ignore:aws-ssm-secret-use-customer-key
 resource "aws_secretsmanager_secret" "grafana" {
-  name = "${local.name}-grafana"
+  name_prefix = "${local.name}-grafana-"
   recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
 }
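With name_prefix, Terraform generates a unique secret name on each apply instead of reusing a fixed one; this presumably avoids name collisions when a secret with the same name still exists, or is still pending deletion, from a previous deployment of the example.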

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
FROM apache/spark:3.5.6-scala2.12-java17-python3-ubuntu

ARG PYTHON_VERSION=3.11.3
ARG BEAM_VERSION=2.58.0
ARG HADOOP_VERSION=3.4.1
ARG AWS_SDK_VERSION=2.29.0
ARG SPARK_UID=185

ENV SPARK_HOME=/opt/spark

# Set up as root to install dependencies and tools
USER root

# Remove any old Hadoop libraries to avoid conflicts
RUN rm -f ${SPARK_HOME}/jars/hadoop-client-* && \
    rm -f ${SPARK_HOME}/jars/hadoop-yarn-server-web-proxy-*.jar

# Add Hadoop AWS connector and related Hadoop dependencies
RUN cd ${SPARK_HOME}/jars && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-yarn-server-web-proxy/${HADOOP_VERSION}/hadoop-yarn-server-web-proxy-${HADOOP_VERSION}.jar && \
    wget https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar

RUN apt-get update && \
    apt-get install -y gcc libssl-dev lzma liblzma-dev libbz2-dev libffi-dev tar gzip wget make && \
    wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \
    tar xzf Python-${PYTHON_VERSION}.tgz && \
    cd Python-${PYTHON_VERSION} && \
    ./configure --enable-optimizations && \
    make install
ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv $VIRTUAL_ENV --copies
RUN cp -r /usr/local/lib/python3.11/* $VIRTUAL_ENV/lib/python3.11
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN python3 -m pip install --upgrade pip && \
    python3 -m pip install apache_beam==${BEAM_VERSION} \
    s3fs \
    boto3
ENV PYSPARK_PYTHON="/opt/venv/bin/python3"
ENV PYSPARK_DRIVER_PYTHON="/opt/venv/bin/python3"
ENV RUN_PYTHON_SDK_IN_DEFAULT_ENVIRONMENT=1
COPY --from=apache/beam_python3.11_sdk:2.58.0 /opt/apache/beam /opt/apache/beam

# Set working directory
WORKDIR ${SPARK_HOME}

USER ${SPARK_UID}
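To run the Beam example below, this image needs to be available at the ECR URI referenced by the SparkApplication (beam-spark-repo:eks-beam-image in us-east-1). A minimal build-and-push sketch, assuming the Dockerfile above is in the current directory and the ECR repository does not exist yet:

export ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
export AWS_REGION=us-east-1
aws ecr create-repository --repository-name beam-spark-repo --region $AWS_REGION
aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com
docker build -t $ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/beam-spark-repo:eks-beam-image .
docker push $ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/beam-spark-repo:eks-beam-image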
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
---
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: beam-wc
  namespace: spark-team-a
spec:
  type: Python
  pythonVersion: "3"
  # Beam runtime image
  image: "$ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com/beam-spark-repo:eks-beam-image"
  imagePullPolicy: Always
  mainClass: org.apache.beam.runners.spark.SparkPipelineRunner
  mainApplicationFile: "s3://$S3_BUCKET/app/wordcountApp.jar"
  sparkConf:
    spark.local.dir: "/data"
    spark.speculation: "false"
    spark.network.timeout: "2400"
    spark.hadoop.fs.s3a.connection.timeout: "1200000"
    spark.hadoop.fs.s3a.path.style.access: "true"
    spark.hadoop.fs.s3a.connection.maximum: "200"
    spark.hadoop.fs.s3a.fast.upload: "true"
    spark.hadoop.fs.s3a.readahead.range: "256K"
    spark.hadoop.fs.s3a.input.fadvise: "random"
    spark.hadoop.fs.s3a.aws.credentials.provider.mapping: "com.amazonaws.auth.WebIdentityTokenCredentialsProvider=software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider"
    spark.hadoop.fs.s3a.aws.credentials.provider: "software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider" # AWS SDK V2 https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/aws_sdk_upgrade.html
    spark.hadoop.fs.s3.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem"
    spark.eventLog.enabled: "true"
    spark.eventLog.dir: "s3://$S3_BUCKET/spark-event-logs/"
    spark.app.name: "beam-wc"
    spark.kubernetes.executor.podNamePrefix: "beam-spark"
    spark.kubernetes.driver.pod.name: beam-spark-driver
    # Required for EMR Runtime and Glue Catalogue
    spark.sql.parquet.fs.optimized.committer.optimization-enabled: "true"
    spark.executor.defaultJavaOptions: -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70 -XX:OnOutOfMemoryError="kill -9 %p"
    spark.driver.defaultJavaOptions: -XX:OnOutOfMemoryError="kill -9 %p" -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70
    spark.pyspark.python: /opt/venv/bin/python3
    spark.pyspark.driver.python: /opt/venv/bin/python3
  sparkVersion: "3.5.6"
  restartPolicy:
    type: Never
  driver:
    cores: 1
    memory: "4g"
    serviceAccount: spark-team-a
    nodeSelector:
      NodeGroupType: "SparkComputeOptimized"
      karpenter.sh/capacity-type: "on-demand"
  executor:
    cores: 1
    instances: 4
    memory: "4g"
    serviceAccount: spark-team-a
    nodeSelector:
      NodeGroupType: "SparkComputeOptimized"
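A sketch of submitting the manifest above, assuming the Spark Operator and the spark-team-a namespace/service account are already provisioned by the blueprint, and that the manifest is saved locally as beam-wordcount.yaml (a hypothetical filename); envsubst fills in the $ACCOUNT_ID and $S3_BUCKET placeholders before the apply:

export ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
export S3_BUCKET=<your-bucket-name>
envsubst < beam-wordcount.yaml | kubectl apply -f -
kubectl get sparkapplication beam-wc -n spark-team-a
kubectl logs -f beam-spark-driver -n spark-team-a

The driver pod name used in the last command (beam-spark-driver) comes from spark.kubernetes.driver.pod.name in the sparkConf above.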
Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow."""

# pytype: skip-file

# beam-playground:
#   name: WordCount
#   description: An example that counts words in Shakespeare's works.
#   multifile: false
#   pipeline_options: --output output.txt
#   context_line: 87
#   categories:
#     - Combiners
#     - Options
#     - Quickstart
#   complexity: MEDIUM
#   tags:
#     - options
#     - count
#     - combine
#     - strings

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.runners.runner import PipelineResult


class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""
  def process(self, element):
    """Returns an iterator over the words of this element.

    The element is a line of text. If the line is blank, note that, too.

    Args:
      element: the element being processed

    Returns:
      The processed element.
    """
    return re.findall(r'[\w\']+', element, re.UNICODE)


def run(argv=None, save_main_session=True) -> PipelineResult:
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  pipeline = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = pipeline | 'Read' >> ReadFromText(known_args.input)

  counts = (
      lines
      | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
      | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
      | 'GroupAndSum' >> beam.CombinePerKey(sum))

  # Format the counts into a PCollection of strings.
  def format_result(word, count):
    return '%s: %d' % (word, count)

  output = counts | 'Format' >> beam.MapTuple(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'Write' >> WriteToText(known_args.output)

  # Execute the pipeline and return the result.
  result = pipeline.run()
  result.wait_until_finish()
  return result


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
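The SparkApplication above points mainApplicationFile at a pre-built wordcountApp.jar; Beam's portable Spark runner can emit such a self-contained jar from a Python pipeline (for example via the --output_executable_path pipeline option), though the exact packaging step is not part of this commit. Independently of that, the pipeline can be smoke-tested on Beam's default DirectRunner; a sketch, assuming the script is saved locally as wordcount.py and /tmp/sample.txt is any text file:

python3 -m pip install apache_beam==2.58.0
python3 wordcount.py --input /tmp/sample.txt --output /tmp/counts
head /tmp/counts-00000-of-00001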

distributed-databases/trino/helm-values/trino.yaml

Lines changed: 12 additions & 12 deletions
@@ -47,7 +47,7 @@ coordinator:
   config:
     query:
       #maxMemoryPerNode + (maxHeapSize * 0.3) < maxHeapSize
-      maxMemoryPerNode: "22GB" # ~70% of maxHeapSize
+      maxMemoryPerNode: "6GB" # ~70% of maxHeapSize
       minWorkers: 1
       initialHashPartitions: 100
   resources:
@@ -60,7 +60,7 @@ coordinator:
   annotations:
     karpenter.sh/do-not-disrupt: "true"
   nodeSelector:
-    NodePool: trino-sql-karpenter
+    NodePool: trino-control-karpenter
     karpenter.sh/capacity-type: on-demand
   topologySpreadConstraints:
     - maxSkew: 1
@@ -72,7 +72,7 @@ coordinator:

 worker:
   jvm:
-    maxHeapSize: "89G" # ~80% of container memory (110Gi)
+    maxHeapSize: "12G" # ~80% of container memory (14Gi)
   extraArguments:
     - "-XX:+UseG1GC"
     - "-XX:G1HeapRegionSize=32M"
@@ -83,14 +83,14 @@ worker:
     - "-XX:+UseContainerSupport"
   config:
     query:
-      maxMemoryPerNode: "71GB" # ~80% of maxHeapSize
+      maxMemoryPerNode: "6GB" # ~80% of maxHeapSize
   resources:
     requests:
-      cpu: "12000m" # Leave 3000m for system/DaemonSets
-      memory: 112Gi # Leave 16Gi for system/DaemonSets
+      cpu: "3000m"
+      memory: 14Gi
     limits:
-      cpu: "14000m"
-      memory: 112Gi
+      cpu: "6000m"
+      memory: 14Gi
   nodeSelector:
     NodePool: trino-sql-karpenter
     karpenter.sh/capacity-type: on-demand
@@ -107,9 +107,9 @@ additionalConfigProperties:
   - "exchange.compression-enabled=true"
   - "query.remote-task.max-error-duration=1m"
   - "query.max-hash-partition-count=100" # Updated from query.hash-partition-count
-  - "spill-enabled=true" # Updated from experimental.spill-enabled
-  - "spiller-spill-path=/tmp/spill" # Chagne this to SSD mount for faster
-  - "memory.heap-headroom-per-node=9.6GB"
+  - "spill-enabled=false" # Updated from experimental.spill-enabled
+  - "spiller-spill-path=/tmp/spill" # Change this to SSD mount for faster
+  - "memory.heap-headroom-per-node=1.6GB"
   - "optimizer.join-reordering-strategy=AUTOMATIC" # Updated from join-reordering-strategy
   - "query.max-history=100"
   - "query.client.timeout=30m"
@@ -186,7 +186,7 @@ serviceMonitor:
   enabled: true
   labels:
     prometheus: kube-prometheus
-  interval: "15s"
+  interval: "5s"
   coordinator:
     enabled: true
     labels:
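As a rough check of the new worker sizing against the rule noted in the coordinator section (maxMemoryPerNode + maxHeapSize * 0.3 < maxHeapSize): with 14Gi containers the worker heap is 12G, and 6GB + 0.3 * 12G = 9.6G, which stays below the 12G heap; memory.heap-headroom-per-node is reduced from 9.6GB to 1.6GB to fit the smaller heap.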

distributed-databases/trino/karpenter.tf

Lines changed: 49 additions & 2 deletions
@@ -94,7 +94,7 @@ resource "kubectl_manifest" "karpenter_node_class" {
 }

 # Create a Karpenter NodePool using the AL2023 NodeClass
-resource "kubectl_manifest" "karpenter_node_pool" {
+resource "kubectl_manifest" "karpenter_worker_pool" {
   yaml_body = <<-YAML
     apiVersion: karpenter.sh/v1
     kind: NodePool
@@ -138,4 +138,51 @@ resource "kubectl_manifest" "karpenter_node_pool" {
   depends_on = [
     kubectl_manifest.karpenter_node_class
   ]
-}
+}
+
+# Create a Karpenter NodePool using the AL2023 NodeClass
+resource "kubectl_manifest" "karpenter_ctl_pool" {
+  yaml_body = <<-YAML
+    apiVersion: karpenter.sh/v1
+    kind: NodePool
+    metadata:
+      name: trino-control-karpenter
+    spec:
+      template:
+        metadata:
+          labels:
+            NodePool: trino-control-karpenter
+        spec:
+          nodeClassRef:
+            group: karpenter.k8s.aws
+            kind: EC2NodeClass
+            name: trino-karpenter
+          requirements:
+            - key: "karpenter.sh/capacity-type"
+              operator: In
+              values: ["on-demand"]
+            - key: "kubernetes.io/arch"
+              operator: In
+              values: ["arm64"]
+            - key: "karpenter.k8s.aws/instance-category"
+              operator: In
+              values: ["r"]
+            - key: "karpenter.k8s.aws/instance-family"
+              operator: In
+              values: ["r6g", "r7g", "r8g"]
+            - key: "karpenter.k8s.aws/instance-size"
+              operator: In
+              values: ["2xlarge", "4xlarge"]
+      disruption:
+        consolidationPolicy: WhenEmpty
+        consolidateAfter: 60s
+      limits:
+        cpu: "128"
+        memory: 256Gi
+      weight: 10
+  YAML
+
+  depends_on = [
+    kubectl_manifest.karpenter_node_class
+  ]
+}
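Once applied, the new NodePool and the coordinator placement (its nodeSelector was switched to trino-control-karpenter in the Helm values above) can be checked with kubectl; a sketch, assuming cluster access and that Trino runs in the trino namespace (an assumption; the namespace is not shown in this diff):

kubectl get nodepools trino-control-karpenter
kubectl get nodes -l NodePool=trino-control-karpenter
kubectl get pods -n trino -o wide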
