Skip to content

[Break Glass] Change CW agent and Fluent Bit to Cloudwatch Observability #47

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Mar 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/pr-triage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
permissions:
pull-requests: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Get PR author
id: get-author
run: |
Expand All @@ -30,7 +30,7 @@ jobs:
- name: Auto-approve if author is able to write and contains only doc change
id: doc-change
if: steps.author-permission.outputs.require-result == 'true'
uses: actions/github-script@v6
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
Expand Down
3 changes: 2 additions & 1 deletion CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
CODEOWNERS @aws-solutions-library-samples/maintainers
* @yubingjiaocn @bnusunny @github-actions
CODEOWNERS @aws-solutions-library-samples/maintainers @yubingjiaocn
/.github/workflows/maintainer_workflows.yml @aws-solutions-library-samples/maintainers
/.github/solutionid_validator.sh @aws-solutions-library-samples/maintainers
5 changes: 3 additions & 2 deletions lib/addons/s3CSIDriver.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import * as blueprints from '@aws-quickstart/eks-blueprints';
import { ManagedPolicy } from 'aws-cdk-lib/aws-iam';
import * as eks from 'aws-cdk-lib/aws-eks';
import * as iam from 'aws-cdk-lib/aws-iam';
import { Construct } from "constructs";

Expand All @@ -12,7 +12,7 @@ export const defaultProps: blueprints.addons.HelmAddOnProps & s3CSIDriverAddOnPr
name: 's3CSIDriverAddOn',
namespace: 'kube-system',
release: 's3-csi-driver-release',
version: 'v1.7.0',
version: 'v1.10.0',
repository: 'https://awslabs.github.io/mountpoint-s3-csi-driver',
s3BucketArn: ''
}
Expand All @@ -31,6 +31,7 @@ export class s3CSIDriverAddOn extends blueprints.addons.HelmAddOn {
const serviceAccount = cluster.addServiceAccount('s3-csi-driver-sa', {
name: 's3-csi-driver-sa',
namespace: this.options.namespace,
identityType: eks.IdentityType.POD_IDENTITY
});

// new IAM policy to grant access to S3 bucket
Expand Down
78 changes: 30 additions & 48 deletions lib/dataPlane.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,49 +48,33 @@ export default class DataPlaneStack {
irsaRoles: ["CloudWatchFullAccess", "AmazonSQSFullAccess"]
};

const CloudWatchLogsWritePolicy = new iam.PolicyStatement({
actions: [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:DescribeLogStreams",
"logs:PutLogEvents",
"logs:GetLogEvents"
],
resources: ["*"],
})

const awsForFluentBitParams: blueprints.AwsForFluentBitAddOnProps = {
iamPolicies: [CloudWatchLogsWritePolicy],
namespace: "amazon-cloudwatch",
values: {
cloudWatchLogs: {
region: cdk.Aws.REGION,
logRetentionDays: 7
},
tolerations: [{
"operator": "Exists",
"effect": "NoSchedule"
}]
},
createNamespace: true
}

const containerInsightsParams: blueprints.ContainerInsightAddonProps = {
values: {
adotCollector: {
daemonSet: {
tolerations: [{
"operator": "Exists",
"effect": "NoSchedule"
}],
cwreceivers: {
preferFullPodName: "true",
addFullPodNameMetricLabel: "true"
const cloudWatchInsightsParams: blueprints.CloudWatchInsightsAddOnProps = {
configurationValues: {
tolerations: [
{
key: "runtime",
operator: "Exists",
effect: "NoSchedule"
},
{
key: "nvidia.com/gpu",
operator: "Exists",
effect: "NoSchedule"
}
],
containerLogs: {
enabled: true,
fluentBit: {
config: {
service: "[SERVICE]\n Flush 5\n Grace 30\n Log_Level info",
extraFiles: {
"application-log.conf": "[INPUT]\n Name tail\n Tag kube.*\n Path /var/log/containers/*.log\n Parser docker\n DB /var/log/flb_kube.db\n Mem_Buf_Limit 5MB\n Skip_Long_Lines On\n Refresh_Interval 10\n\n[FILTER]\n Name kubernetes\n Match kube.*\n Kube_URL https://kubernetes.default.svc:443\n Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt\n Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token\n Kube_Tag_Prefix kube.var.log.containers.\n Merge_Log On\n Merge_Log_Key log_processed\n K8S-Logging.Parser On\n K8S-Logging.Exclude On\n\n[FILTER]\n Name grep\n Match kube.*\n Exclude $kubernetes['namespace_name'] kube-system\n\n[OUTPUT]\n Name cloudwatch\n Match kube.*\n region ${AWS_REGION}\n log_group_name /aws/containerinsights/${CLUSTER_NAME}/application\n log_stream_prefix ${HOST_NAME}-\n auto_create_group true\n retention_in_days 7"
}
}
}
}
}
}
};

const SharedComponentAddOnParams: SharedComponentAddOnProps = {
inputSns: blueprints.getNamedResource("inputSNSTopic"),
Expand All @@ -116,12 +100,10 @@ export default class DataPlaneStack {
new blueprints.addons.AwsLoadBalancerControllerAddOn(),
new blueprints.addons.KarpenterAddOn({ interruptionHandling: true }),
new blueprints.addons.KedaAddOn(kedaParams),
new blueprints.addons.ContainerInsightsAddOn(containerInsightsParams),
new blueprints.addons.AwsForFluentBitAddOn(awsForFluentBitParams),
new blueprints.addons.CloudWatchInsights(cloudWatchInsightsParams),
new s3CSIDriverAddOn(s3CSIDriverAddOnParams),
new SharedComponentAddOn(SharedComponentAddOnParams),
new EbsThroughputTunerAddOn(EbsThroughputModifyAddOnParams),
new dcgmExporterAddOn({})
];

// Generate SD Runtime Addon for runtime
Expand Down Expand Up @@ -160,9 +142,9 @@ const MngProps: blueprints.MngClusterProviderProps = {
minSize: 2,
maxSize: 2,
desiredSize: 2,
version: eks.KubernetesVersion.V1_29,
instanceTypes: [new ec2.InstanceType('m5.large')],
amiType: eks.NodegroupAmiType.AL2_X86_64,
version: eks.KubernetesVersion.V1_31,
instanceTypes: [new ec2.InstanceType('m7g.large')],
amiType: eks.NodegroupAmiType.AL2023_ARM_64_STANDARD,
enableSsmPermissions: true,
nodeGroupTags: {
"Name": cdk.Aws.STACK_NAME + "-ClusterComponents",
Expand All @@ -172,7 +154,7 @@ const MngProps: blueprints.MngClusterProviderProps = {

// Deploy EKS cluster with all add-ons
const blueprint = blueprints.EksBlueprint.builder()
.version(eks.KubernetesVersion.V1_29)
.version(eks.KubernetesVersion.V1_31)
.addOns(...addOns)
.resourceProvider(
blueprints.GlobalResources.Vpc,
Expand All @@ -185,7 +167,7 @@ const blueprint = blueprints.EksBlueprint.builder()
.resourceProvider("s3GWEndpoint", new s3GWEndpointProvider("s3GWEndpoint"))
.clusterProvider(new blueprints.MngClusterProvider(MngProps))
.build(scope, id + 'Stack', props);

/*
// Workaround for permission denied when creating cluster
const handler = blueprint.node.tryFindChild('@aws-cdk--aws-eks.KubectlProvider')!
.node.tryFindChild('Handler')! as cdk.aws_lambda.Function
Expand All @@ -202,7 +184,7 @@ const blueprint = blueprints.EksBlueprint.builder()
actions: ["lambda:GetFunctionConfiguration"],
resources: [handler.functionArn]
}))

*/
// Provide static output name for cluster
const cluster = blueprint.getClusterInfo().cluster
const clusterNameCfnOutput = cluster.node.findChild('ClusterName') as cdk.CfnOutput;
Expand Down
105 changes: 105 additions & 0 deletions lib/resourceProvider/vpc.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import { Tags } from 'aws-cdk-lib';
import * as ec2 from 'aws-cdk-lib/aws-ec2';
import { ISubnet, PrivateSubnet } from 'aws-cdk-lib/aws-ec2';
import * as blueprints from '@aws-quickstart/eks-blueprints';

/**
 * Optional CIDR settings consumed by the VPC resource provider.
 */
interface VpcProps {
    /** CIDR range used when a new VPC is created. */
    primaryCidr?: string;
    /** Additional CIDR block to associate with the VPC. */
    secondaryCidr?: string;
    /** CIDRs of the secondary private subnets, matched to availability zones by index. */
    secondarySubnetCidrs?: string[];
}

/**
 * VPC resource provider.
 *
 * Resolves the VPC for a blueprint: an existing VPC is looked up when `vpcId` is
 * supplied (see `getVPCFromId`), otherwise a new VPC is created. When a secondary
 * CIDR is configured it is associated with the VPC and, if subnet CIDRs are also
 * supplied, private subnets are created in it (one per availability zone index).
 */
export class VpcProvider implements blueprints.ResourceProvider<ec2.IVpc> {
    readonly vpcId?: string;
    readonly primaryCidr?: string;
    readonly secondaryCidr?: string;
    readonly secondarySubnetCidrs?: string[];

    /**
     * @param vpcId id of an existing VPC to look up (`"default"` for the default VPC);
     *              when omitted a new VPC is created
     * @param vpcProps optional primary/secondary CIDR configuration
     */
    constructor(vpcId?: string, private vpcProps?: VpcProps) {
        this.vpcId = vpcId;
        this.primaryCidr = vpcProps?.primaryCidr;
        this.secondaryCidr = vpcProps?.secondaryCidr;
        this.secondarySubnetCidrs = vpcProps?.secondarySubnetCidrs;
    }

    /**
     * Looks up or creates the VPC and, when configured, attaches the secondary CIDR.
     *
     * @param context resource context supplying the construct scope
     * @returns the looked-up or newly created VPC
     */
    provide(context: blueprints.ResourceContext): ec2.IVpc {
        const id = context.scope.node.id;

        let vpc = getVPCFromId(context, id, this.vpcId);
        if (vpc == null) {
            // It will automatically divide the provided VPC CIDR range, and create public and private subnets per Availability Zone.
            // If VPC CIDR range is not provided, uses `10.0.0.0/16` as the range and creates public and private subnets per Availability Zone.
            // Network routing for the public subnets will be configured to allow outbound access directly via an Internet Gateway.
            // Network routing for the private subnets will be configured to allow outbound access via a set of resilient NAT Gateways (one per AZ).
            if (this.primaryCidr) {
                vpc = new ec2.Vpc(context.scope, id + "-vpc", {
                    ipAddresses: ec2.IpAddresses.cidr(this.primaryCidr)
                });
            } else {
                vpc = new ec2.Vpc(context.scope, id + "-vpc");
            }
        }

        // Creates Secondary CIDR and Secondary subnets if passed.
        if (this.secondaryCidr) {
            this.createSecondarySubnets(context, id, vpc);
        }

        return vpc;
    }

    /**
     * Associates the secondary CIDR block with the VPC and creates one private subnet
     * for every availability zone index that has a matching entry in
     * `secondarySubnetCidrs`. Each subnet is registered on the context under
     * `secondary-cidr-subnet-<index>` and tagged.
     *
     * Indexes without a (non-empty) CIDR are skipped entirely, so a gap in
     * `secondarySubnetCidrs` no longer leaves an `undefined` entry to be tagged.
     *
     * @param context resource context supplying the scope and receiving the subnet providers
     * @param id prefix used for the construct ids
     * @param vpc VPC the secondary CIDR and subnets belong to
     */
    protected createSecondarySubnets(context: blueprints.ResourceContext, id: string, vpc: ec2.IVpc) {
        const secondaryCidr = new ec2.CfnVPCCidrBlock(context.scope, id + "-secondaryCidr", {
            vpcId: vpc.vpcId,
            cidrBlock: this.secondaryCidr
        });
        secondaryCidr.node.addDependency(vpc);

        const subnetCidrs = this.secondarySubnetCidrs;
        if (!subnetCidrs) {
            return;
        }

        for (let i = 0; i < vpc.availabilityZones.length; i++) {
            const cidrBlock = subnetCidrs[i];
            if (!cidrBlock) {
                continue;
            }

            const subnet = new ec2.PrivateSubnet(context.scope, id + "private-subnet-" + i, {
                availabilityZone: vpc.availabilityZones[i],
                cidrBlock,
                vpcId: vpc.vpcId
            });
            // The subnet can only be created once the secondary CIDR is associated.
            subnet.node.addDependency(secondaryCidr);
            context.add("secondary-cidr-subnet-" + i, {
                provide(_context): ISubnet { return subnet; }
            });
            this.tagSecondarySubnet(subnet);
        }
    }

    /** Applies the internal-ELB role tag and the Name tag to a secondary subnet. */
    private tagSecondarySubnet(subnet: PrivateSubnet): void {
        Tags.of(subnet).add("kubernetes.io/role/internal-elb", "1", { applyToLaunchedInstances: true });
        // NOTE(review): the Name tag interpolates the subnet construct itself, so its value
        // depends on the construct's toString(); kept as-is to avoid changing deployed tags.
        Tags.of(subnet).add("Name", `blueprint-construct-dev-PrivateSubnet-${subnet}`, { applyToLaunchedInstances: true });
    }
}



/**
 * Looks up an existing VPC for the given `vpcId`.
 *
 * @param context resource context whose scope is used for the lookup
 * @param nodeId prefix of the construct id; the lookup is created as `<nodeId>-vpc`
 * @param vpcId `"default"` selects the default VPC; any other non-empty value is looked up by id
 * @returns the looked-up VPC, or `undefined` when no `vpcId` was supplied
 */
export function getVPCFromId(context: blueprints.ResourceContext, nodeId: string, vpcId?: string) {
    // Nothing to look up: the caller decides what to do without a VPC.
    if (!vpcId) {
        return undefined;
    }

    const lookupId = nodeId + "-vpc";

    if (vpcId !== "default") {
        console.log(`looking up non-default ${vpcId} VPC`);
        return ec2.Vpc.fromLookup(context.scope, lookupId, { vpcId: vpcId });
    }

    console.log(`looking up completely default VPC`);
    return ec2.Vpc.fromLookup(context.scope, lookupId, { isDefault: true });
}
Loading