6 changes: 6 additions & 0 deletions .gitignore
@@ -2,6 +2,12 @@
!jest.config.js
*.d.ts
node_modules
*.swp
package-lock.json
__pycache__
.pytest_cache
.env
*.egg-info

# CDK asset staging directory
.cdk.staging
68 changes: 68 additions & 0 deletions DeployChemAxonCompRegEnv.sh
@@ -0,0 +1,68 @@
#!/bin/bash
set -e -x
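# Usage: bash DeployChemAxonCompRegEnv.sh <env> [aws_profile]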

# Change to the chem-axon-setup directory
cd chem-axon-setup

# Read Arguments
export EnvironVarLower=$1
export AwsProfile=$2

# Install packages
pip3 install -r requirements.txt

## Initialise Variables
source configs/deploy_config.env ${EnvironVarLower}

## Create an empty AWS credentials file, as the cdk module depends on it existing.
## It is empty in this case and is only required for the cdk module to work.
mkdir -p ~/.aws && touch ~/.aws/credentials

# Function to build compound reg docker image and push it to AWS ECR Repo
build_and_push_comp_reg_image(){

# Build the docker image locally
docker build --rm -t ${AccountId}.dkr.ecr.${AwsRegion}.amazonaws.com/${WorkflowEcrRepository}:${WorkflowCompRegImage} \
--build-arg="PYTHON_VERSION=3.7" \
--build-arg="UBUNTU_VERSION=18.04" \
--build-arg="ORACLE_VERSION=12.2.0.1.0" \
--build-arg="ORACLE_ZIP_INTERNAL_FOLDER=instantclient_12_2" compound_reg_pipeline/

# Push the image to AWS ECR
docker push ${AccountId}.dkr.ecr.${AwsRegion}.amazonaws.com/${WorkflowEcrRepository}:${WorkflowCompRegImage}

}

# Check for an AWS profile; if none is passed, use the default profile
if [[ -z "${AwsProfile}" ]]
then
#Get Account ID
aws configure set region ${AwsRegion}
export AccountId=$(aws sts get-caller-identity --output text --query 'Account')

#ECR Login for pushing Docker Image
$(aws ecr get-login --no-include-email --region ${AwsRegion})

cdk deploy gfb-datalake-batch-stack --require-approval never
cdk deploy gfb-datalake-batch-job-stack --require-approval never
build_and_push_comp_reg_image
cdk deploy gfb-datalake-secret-manager-stack --require-approval never
cdk deploy gfb-datalake-lambda-stack --require-approval never
cdk deploy gfb-datalake-glue-stack --require-approval never

else
#Get Account ID
aws configure set region ${AwsRegion} --profile ${AwsProfile}
export AccountId=$(aws sts get-caller-identity --output text --query 'Account' --profile ${AwsProfile})

#ECR Login for pushing Docker Image
$(aws ecr get-login --no-include-email --region ${AwsRegion} --profile ${AwsProfile})

cdk deploy gfb-datalake-batch-stack --require-approval never --profile ${AwsProfile}
cdk deploy gfb-datalake-batch-job-stack --require-approval never --profile ${AwsProfile}
build_and_push_comp_reg_image
cdk deploy gfb-datalake-secret-manager-stack --require-approval never --profile ${AwsProfile}
cdk deploy gfb-datalake-lambda-stack --require-approval never --profile ${AwsProfile}
cdk deploy gfb-datalake-glue-stack --require-approval never --profile ${AwsProfile}

fi
62 changes: 62 additions & 0 deletions chem-axon-setup/README.md
@@ -0,0 +1,62 @@
# ChemAxon Compound Registration DB to AWS Data Lake

This folder holds the codebase for creating the AWS infrastructure and the ETL job that loads data from an existing Compound Registration database to S3 on a schedule.

### Pre-requisites

- An AWS RDS instance that holds the Compound Registration data.

- AWS details such as the VPC, subnets, AZs, and RDS connection details, filled in to the configs/deploy_config.env file for each environment.

- Python and the AWS CDK installed.

### Steps to be executed

- Download the codebase locally.
- Ensure the AWS profiles are set for use.
- Fill in the details in configs/deploy_config.env.
- Start the deployment by running:

```bash
# env is the same environment name used in configs/deploy_config.env, in lower case.
# aws_profile is the profile to be used; if no profile is manually set, provide default.
bash DeployChemAxonCompRegEnv.sh <env> <aws_profile>
```

### What will be set up?

When the deployment script (DeployChemAxonCompRegEnv.sh) is executed, it gathers the variables from configs/deploy_config.env for the environment passed and makes them available in the shell. It then creates the objects below, in the order listed.

- AWS Batch Infrastructure: creates a compute environment and job queue, along with the required EC2 security group and IAM roles and policies.

- AWS ECR and Batch Job: creates the ECR repository and the AWS Batch job definition.

- Docker Image: builds a Docker image from the compound_reg_pipeline folder as per its Dockerfile. It currently assumes the Comp-Reg RDS is Oracle and therefore installs the Oracle client dependencies needed by the ETL code. The actual ETL job is comp_reg_data_load.py, which is invoked at the required frequency.

This image is then pushed to the ECR repository.

- AWS Secrets Manager: a secret with the Comp Reg RDS credentials is created. This secret is then read by the ETL (see the sketch after this list).

- AWS S3 Bucket and Lambda: an S3 bucket used for data loading is created, along with a Lambda function used to trigger the ETL. The Lambda function can be found at chem-axon-setup/lambdas/trigger_compound_reg_pipeline.py (see the sketch after this list).

It currently uses an S3 event trigger, but that can be changed to any other trigger of choice.

- AWS Glue: a Glue database and table are created on top of the S3 bucket data for querying through Athena.
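
A minimal sketch of how the ETL could read the Comp Reg RDS credentials from Secrets Manager with boto3. The secret name `CompRegConn` and the assumption that the secret value is a JSON string are taken from the sample config in app.py and are illustrative only:

```python
import json
import boto3

def get_comp_reg_credentials(secret_name: str = "CompRegConn") -> dict:
    """Fetch the Comp Reg DB credentials stored in AWS Secrets Manager."""
    client = boto3.client("secretsmanager")
    response = client.get_secret_value(SecretId=secret_name)
    # Assumes the secret value is a JSON string of connection details (host, port, user, ...).
    return json.loads(response["SecretString"])
```

Likewise, a hedged sketch of what the trigger Lambda could look like: an S3 event handler that submits the AWS Batch job running the ETL container. The job name, queue, and job definition below are placeholders, not the actual names defined in the CDK stacks:

```python
import boto3

batch = boto3.client("batch")

def lambda_handler(event, context):
    """Triggered by an S3 event; submits the compound-reg ETL job to AWS Batch."""
    response = batch.submit_job(
        jobName="comp-reg-data-load",             # placeholder job name
        jobQueue="datalake-job-queue",            # placeholder queue from the Batch stack
        jobDefinition="comp-reg-job-definition",  # placeholder job definition
    )
    return {"jobId": response["jobId"]}
```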


### ETL Process

When triggered, the script chem-axon-setup/compound_reg_pipeline/comp_reg_data_load.py performs the following steps (a simplified sketch of the comparison logic follows the list):

- Queries the latest data from the Comp Reg DB and creates a pandas DataFrame.
- Reads all previously loaded data from S3 and creates another pandas DataFrame.
- Compares the two DataFrames using hashes and creates a new DataFrame containing only new or updated records.
- Writes this new DataFrame to a new, date-based S3 partition in Parquet format.
  (The partition scheme is user specific and depends on the execution frequency.)
- If no new data is detected, the script simply exits.
- Logs are available in CloudWatch and can be found under AWS Batch -> Jobs dashboard -> the specific job ID details.
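
A minimal, hypothetical sketch of the hash-based change detection described above; it is not the actual implementation in comp_reg_data_load.py, and the bucket path in the usage comment is a placeholder:

```python
import pandas as pd

def find_new_or_updated(source_df: pd.DataFrame, loaded_df: pd.DataFrame) -> pd.DataFrame:
    """Return rows from source_df that are new or changed relative to loaded_df."""
    # Hash each row so changed records can be detected without comparing column by column.
    source_hashes = pd.util.hash_pandas_object(source_df, index=False)
    loaded_hashes = pd.util.hash_pandas_object(loaded_df, index=False)
    # Keep only rows whose hash is not already present in the loaded data.
    return source_df[~source_hashes.isin(loaded_hashes)]

# Usage sketch:
# delta = find_new_or_updated(comp_reg_df, already_loaded_df)
# if not delta.empty:
#     delta.to_parquet("s3://<datalake-bucket>/comp_reg/load_date=<YYYY-MM-DD>/data.parquet")
```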
Empty file added chem-axon-setup/__init__.py
Empty file.
58 changes: 58 additions & 0 deletions chem-axon-setup/app.py
@@ -0,0 +1,58 @@
#!/usr/bin/env python3

from aws_cdk import core
import os
from data_lake_setup.datalake_batch_stack import DatalakeBatchStack
from data_lake_setup.datalake_batch_job_def_stack import DatalakeBatchJobStack
from data_lake_setup.datalake_secret_manager_stack import DatalakeSecretManagerStack
from data_lake_setup.datalake_lambda_stack import DatalakeLambdaStack
from data_lake_setup.datalake_glue_stack import DatalakeGlueStack

""" Define your account id to make import vpc work """
env_cn = core.Environment(account=os.environ.get("AccountId"), region=os.environ.get("AwsRegion"))

""" Initialising environment variables and creating a dictionary to pass"""
config_dict = {}
config_dict['env_var'] = os.environ.get("EnvironVarLower")
config_dict['vpc_id'] = os.environ.get("VpcId")
config_dict['SubnetIds'] = os.environ.get("SubnetIds")
config_dict['AvailabilityZones'] = os.environ.get("AvailabilityZones")
config_dict['workflow_ecr_repo'] = os.environ.get("WorkflowEcrRepository")
config_dict['datalake_bucket_name'] = "datalake-" + config_dict['env_var'].lower()
config_dict['datalake_db_name'] = "datalake_db"
config_dict['workflow_comp_reg_image_version'] = os.environ.get("WorkflowCompRegImage")
config_dict['comp_reg_secret_name'] = os.environ.get("CompRegSecretName")
config_dict['comp_reg_host_name'] = os.environ.get("CompRegHostName")
config_dict['comp_reg_port'] = os.environ.get("CompRegPort")
config_dict['comp_reg_db_name'] = os.environ.get("CompRegDBName")
config_dict['comp_reg_user_name'] = os.environ.get("CompRegUserName")
config_dict['comp_reg_password'] = os.environ.get("CompRegPassword")

""" Sample config_dict would look like below :
config_dict = {
'env_var': 'prod',
'vpc_id': 'vpc-01234567',
'SubnetIds': 'subnet-01234567,subnet-0123456789',
'AvailabilityZones': 'us-east-1a,us-east-1b',
'workflow_ecr_repo': 'datalake-repo',
'datalake_bucket_name': 'datalake-prod',
'datalake_db_name': 'datalake_db',
'workflow_comp_reg_image_version': 'comp-reg-1.0',
'comp_reg_secret_name': 'CompRegConn',
'comp_reg_host_name': 'db_endpoint_host_name',
'comp_reg_port': 'db_port',
'comp_reg_db_name': 'db_name',
'comp_reg_user_name': 'db_user',
'comp_reg_password': 'db_pass'
}
"""

""" Start execution of deployment """
app = core.App()
DatalakeBatchStack(app, "datalake-batch-stack", config_dict, env=env_cn)
DatalakeBatchJobStack(app, "datalake-batch-job-stack", config_dict, env=env_cn)
DatalakeSecretManagerStack(app, "datalake-secret-manager-stack", config_dict, env=env_cn)
DatalakeLambdaStack(app, "datalake-lambda-stack", config_dict, env=env_cn)
DatalakeGlueStack(app, "datalake-glue-stack", config_dict, env=env_cn)

app.synth()
3 changes: 3 additions & 0 deletions chem-axon-setup/cdk.json
@@ -0,0 +1,3 @@
{
"app": "python3 app.py"
}
41 changes: 41 additions & 0 deletions chem-axon-setup/compound_reg_pipeline/Dockerfile
@@ -0,0 +1,41 @@
ARG PYTHON_VERSION
ARG UBUNTU_VERSION
FROM ubuntu:${UBUNTU_VERSION} AS client
ARG ORACLE_VERSION
ARG ORACLE_ZIP_INTERNAL_FOLDER
WORKDIR /root
ENV CLIENT_ZIP=instantclient-basiclite-linux.x64-${ORACLE_VERSION}.zip
ENV SDK_ZIP=instantclient-sdk-linux.x64-${ORACLE_VERSION}.zip

RUN apt-get update && apt-get -yq install unzip
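# Note: both Oracle Instant Client zips referenced above must be present in the build
# context (compound_reg_pipeline/, as used by DeployChemAxonCompRegEnv.sh) for these COPY steps to succeed.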
COPY ${CLIENT_ZIP} .
COPY ${SDK_ZIP} .
RUN unzip ${CLIENT_ZIP}
RUN unzip ${SDK_ZIP}
RUN mv ${ORACLE_ZIP_INTERNAL_FOLDER} oracle

FROM python:${PYTHON_VERSION}
LABEL maintainer=spate@goldfinchbio.com
ARG ORACLE_VERSION
ENV HOME /root
ENV ORACLE_HOME /opt/oracle
ENV TNS_ADMIN ${ORACLE_HOME}/network/admin
VOLUME ["${TNS_ADMIN}"]

COPY --from=client /root/oracle ${ORACLE_HOME}
RUN apt-get update \
&& apt-get -yq install libaio1 \
&& apt-get -yq install vim \
&& apt-get -yq autoremove \
&& apt-get clean \
# Install Oracle Instant Client
&& echo ${ORACLE_HOME} > /etc/ld.so.conf.d/oracle.conf \
&& mkdir -p ${TNS_ADMIN} \
&& ldconfig \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

RUN pip install numpy pandas boto3 s3fs fastparquet mypy_extensions psutil awscli toolz dask cx_Oracle

RUN mkdir /scripts
WORKDIR /scripts
COPY comp_reg_data_load.py /scripts