
Commit d49e88c

Migrate Databricks account to Azure Databricks (#82)
* update azure databricks details
1 parent 8b27e03 · commit d49e88c

7 files changed: 23 additions & 61 deletions

.github/workflows/ci.yml

Lines changed: 3 additions & 1 deletion

@@ -172,7 +172,9 @@ jobs:
           path: ./.coverage
         env:
           DATABRICKS_CONN_TOKEN: ${{ secrets.DATABRICKS_CONN_TOKEN }}
-          DATABRICKS_CONN_HOST: https://dbc-9c390870-65ef.cloud.databricks.com/
+          DATABRICKS_CONN_HOST: ${{ secrets.DATABRICKS_CONN_HOST }}
+          DATABRICKS_CONN: ${{ secrets.AIRFLOW_CONN_DATABRICKS_DEFAULT }}
+
 
   Code-Coverage:
     if: github.event.action != 'labeled'
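The hard-coded AWS workspace URL is replaced by repository secrets, so the workspace can change without touching the workflow. As a rough sketch of how the test environment might consume these variables (the connection id is a placeholder, and this assumes the usual Airflow convention of carrying the token in the password field):

import os

from airflow.models.connection import Connection

# Build a Databricks connection from the two plain env vars set above.
conn = Connection(
    conn_id="databricks_conn",  # placeholder id
    conn_type="databricks",
    host=os.environ["DATABRICKS_CONN_HOST"],       # now an Azure workspace URL
    password=os.environ["DATABRICKS_CONN_TOKEN"],  # personal access token
)

Airflow also resolves environment variables named AIRFLOW_CONN_<CONN_ID> as complete connection URIs, which is what the AIRFLOW_CONN_DATABRICKS_DEFAULT secret holds.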

dev/dags/basic_notebooks.py

Lines changed: 1 addition & 8 deletions

@@ -10,14 +10,7 @@
         "new_cluster": {
             "cluster_name": "",
             "spark_version": "11.3.x-scala2.12",
-            "aws_attributes": {
-                "first_on_demand": 1,
-                "availability": "SPOT_WITH_FALLBACK",
-                "zone_id": "us-east-2b",
-                "spot_bid_price_percent": 100,
-                "ebs_volume_count": 0,
-            },
-            "node_type_id": "i3.xlarge",
+            "node_type_id": "Standard_DS3_v2",
             "spark_env_vars": {"PYSPARK_PYTHON": "/databricks/python3/bin/python3"},
             "enable_elastic_disk": False,
             "data_security_mode": "LEGACY_SINGLE_USER_STANDARD",

dev/dags/common_operator.py

Lines changed: 2 additions & 9 deletions

@@ -1,8 +1,8 @@
 from datetime import datetime
 
 from airflow.decorators import dag
-from astro_databricks.operators.notebook import DatabricksNotebookOperator
 from astro_databricks.operators.common import DatabricksTaskOperator
+from astro_databricks.operators.notebook import DatabricksNotebookOperator
 from astro_databricks.operators.workflow import DatabricksWorkflowTaskGroup
 
 job_clusters = [
@@ -11,14 +11,7 @@
         "new_cluster": {
             "cluster_name": "",
             "spark_version": "11.3.x-scala2.12",
-            "aws_attributes": {
-                "first_on_demand": 1,
-                "availability": "SPOT_WITH_FALLBACK",
-                "zone_id": "us-east-2b",
-                "spot_bid_price_percent": 100,
-                "ebs_volume_count": 0,
-            },
-            "node_type_id": "i3.xlarge",
+            "node_type_id": "Standard_DS3_v2",
             "spark_env_vars": {"PYSPARK_PYTHON": "/databricks/python3/bin/python3"},
             "enable_elastic_disk": False,
             "data_security_mode": "LEGACY_SINGLE_USER_STANDARD",

dev/dags/task_group_example.py

Lines changed: 1 addition & 8 deletions

@@ -10,14 +10,7 @@
         "new_cluster": {
             "cluster_name": "",
             "spark_version": "11.3.x-scala2.12",
-            "aws_attributes": {
-                "first_on_demand": 1,
-                "availability": "SPOT_WITH_FALLBACK",
-                "zone_id": "us-east-2b",
-                "spot_bid_price_percent": 100,
-                "ebs_volume_count": 0,
-            },
-            "node_type_id": "i3.xlarge",
+            "node_type_id": "Standard_DS3_v2",
             "spark_env_vars": {"PYSPARK_PYTHON": "/databricks/python3/bin/python3"},
             "enable_elastic_disk": False,
             "data_security_mode": "LEGACY_SINGLE_USER_STANDARD",

example_dags/example_databricks_notebook.py

Lines changed: 1 addition & 8 deletions

@@ -18,14 +18,7 @@
 NEW_CLUSTER_SPEC = {
     "cluster_name": "",
     "spark_version": "11.3.x-scala2.12",
-    "aws_attributes": {
-        "first_on_demand": 1,
-        "availability": "SPOT_WITH_FALLBACK",
-        "zone_id": "us-east-2b",
-        "spot_bid_price_percent": 100,
-        "ebs_volume_count": 0,
-    },
-    "node_type_id": "i3.xlarge",
+    "node_type_id": "Standard_DS3_v2",
     "spark_env_vars": {"PYSPARK_PYTHON": "/databricks/python3/bin/python3"},
     "enable_elastic_disk": False,
     "data_security_mode": "LEGACY_SINGLE_USER_STANDARD",

example_dags/example_databricks_workflow.py

Lines changed: 10 additions & 15 deletions

@@ -15,11 +15,12 @@
 }
 
 DATABRICKS_CONN_ID = os.getenv("ASTRO_DATABRICKS_CONN_ID", "databricks_conn")
-DATABRICKS_NOTIFICATION_EMAIL = os.getenv(
-    "ASTRO_DATABRICKS_NOTIFICATION_EMAIL", "tatiana.alchueyr@astronomer.io"
-)
+
+# DATABRICKS_NOTIFICATION_EMAIL = os.getenv(
+#     "ASTRO_DATABRICKS_NOTIFICATION_EMAIL", "tatiana.alchueyr@astronomer.io"
+# )
 DATABRICKS_DESTINATION_ID = os.getenv(
-    "ASTRO_DATABRICKS_DESTINATION_ID", "b0aea8ab-ea8c-4a45-a2e9-9a26753fd702"
+    "ASTRO_DATABRICKS_DESTINATION_ID", "48c7315c-1d65-4ee3-b7d3-1692e8e8012d"
 )
 
 USER = os.environ.get("USER")
@@ -32,14 +33,7 @@
     "new_cluster": {
         "cluster_name": "",
         "spark_version": "11.3.x-scala2.12",
-        "aws_attributes": {
-            "first_on_demand": 1,
-            "availability": "SPOT_WITH_FALLBACK",
-            "zone_id": "us-east-2b",
-            "spot_bid_price_percent": 100,
-            "ebs_volume_count": 0,
-        },
-        "node_type_id": "i3.xlarge",
+        "node_type_id": "Standard_DS3_v2",
         "spark_env_vars": {"PYSPARK_PYTHON": "/databricks/python3/bin/python3"},
         "enable_elastic_disk": False,
         "data_security_mode": "LEGACY_SINGLE_USER_STANDARD",
@@ -72,9 +66,10 @@
         },
     ],
     extra_job_params={
-        "email_notifications": {
-            "on_start": [DATABRICKS_NOTIFICATION_EMAIL],
-        },
+        ## Commented below to avoid spam; keeping this for example purposes.
+        # "email_notifications": {
+        #     "on_start": [DATABRICKS_NOTIFICATION_EMAIL],
+        # },
         "webhook_notifications": {
            "on_start": [{"id": DATABRICKS_DESTINATION_ID}],
         },
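On-start e-mails give way to a webhook notification destination, referenced by the UUID stored in DATABRICKS_DESTINATION_ID (destinations are created in the Databricks workspace settings). A hedged sketch of how these params attach to the workflow task group (the group id and cluster variable name are placeholders):

workflow = DatabricksWorkflowTaskGroup(
    group_id="example_databricks_workflow",  # placeholder
    databricks_conn_id=DATABRICKS_CONN_ID,
    job_clusters=job_cluster_spec,  # list holding the Azure new_cluster above
    extra_job_params={
        "webhook_notifications": {
            "on_start": [{"id": DATABRICKS_DESTINATION_ID}],
        },
    },
)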

example_dags/example_task_group.py

Lines changed: 5 additions & 12 deletions

@@ -1,13 +1,11 @@
 import os
 from datetime import datetime
 
-from airflow.decorators import dag, task_group
+from airflow.decorators import dag
 from airflow.utils.task_group import TaskGroup
-
 from astro_databricks.operators.notebook import DatabricksNotebookOperator
 from astro_databricks.operators.workflow import DatabricksWorkflowTaskGroup
 
-
 DATABRICKS_CONN = "databricks_conn"
 USER = os.environ.get("USER")
 GROUP_ID = os.getenv("DATABRICKS_GROUP_ID", "1234").replace(".", "_")
@@ -18,14 +16,7 @@
     "new_cluster": {
         "cluster_name": "",
         "spark_version": "11.3.x-scala2.12",
-        "aws_attributes": {
-            "first_on_demand": 1,
-            "availability": "SPOT_WITH_FALLBACK",
-            "zone_id": "us-east-2b",
-            "spot_bid_price_percent": 100,
-            "ebs_volume_count": 0,
-        },
-        "node_type_id": "i3.xlarge",
+        "node_type_id": "Standard_DS3_v2",
         "spark_env_vars": {"PYSPARK_PYTHON": "/databricks/python3/bin/python3"},
         "enable_elastic_disk": False,
         "data_security_mode": "LEGACY_SINGLE_USER_STANDARD",
@@ -40,7 +31,9 @@
     schedule_interval="@daily",
     start_date=datetime(2021, 1, 1),
     catchup=False,
-    default_args={'retries': 0},  # Users are encouraged to use the repair feature, retries may fail
+    default_args={
+        "retries": 0
+    },  # Users are encouraged to use the repair feature, retries may fail
     tags=["astro-provider-databricks"],
 )
 def example_task_group():
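The now-unused task_group decorator import is dropped, and retries are pinned to 0 because a failed Databricks workflow is meant to be fixed with the provider's repair feature rather than rerun from scratch. A hedged sketch of the pattern this DAG demonstrates, nesting a plain Airflow TaskGroup inside the Databricks workflow (notebook path and job cluster key are placeholders):

with DatabricksWorkflowTaskGroup(
    group_id=f"test_workflow_{USER}_{GROUP_ID}",
    databricks_conn_id=DATABRICKS_CONN,
    job_clusters=job_cluster_spec,  # placeholder name for the cluster list
) as workflow:
    with TaskGroup(group_id="inner_group"):
        DatabricksNotebookOperator(
            task_id="notebook",
            databricks_conn_id=DATABRICKS_CONN,
            notebook_path="/Shared/example_notebook",  # placeholder
            source="WORKSPACE",
            job_cluster_key="astro_databricks",  # placeholder key
        )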
