diff --git a/HiveOperator/README.md b/HiveOperator/README.md index 0f22141..d2d8d83 100644 --- a/HiveOperator/README.md +++ b/HiveOperator/README.md @@ -2,7 +2,12 @@ HiveOperator [(Source code)](https://github.com/SAP/datahub-integration-examples ------------ This operator provides functionality to query a Hive Metastore server using a HiveQL string and returns a response in the format of a delimited string. -The operator runs on a custom Docker images that extends the SAP-deliver docker image `com.sap.python2.7` and uses the Kerberos client binary `krb5-user` as well as `libsasl2` for Ubuntu. The PyHive python module is developed and maintained by Dropbox: https://github.com/dropbox/PyHive +Two implementations are provided for this operator based on different versions of python. +- For python2, it runs on a custom Docker image that extends the SAP-delivered docker image `com.sap.python2.7`. +- For python3, it runs on a custom Docker image that extends the python3.6 docker image. +> As of SAP Data Intelligence version 3.0, only python3 is supported. + +Both use the Kerberos client binary `krb5-user` as well as `libsasl2` for Ubuntu. The PyHive python module is developed and maintained by Dropbox: https://github.com/dropbox/PyHive ![alt text](./graph.jpg "Graph") @@ -25,7 +30,7 @@ Before you start using the example, please make sure that: - Install Kerberos client libraries **2. Custom operator 'HiveOperator'** - - Derived from Pythin20Operator + - Derived from Python20Operator - Uses image tags `python27:""` and `pyhive:pip2` - **input port `inSql` of type string:** expects a single HiveQL-compliant string without a semicolon - **output port `output` of type string:** outputs the response from the Hive Metastore server, columns are delimited by a comma (default) but can be overriden using the `delimiter` configuration parameter (See description below) @@ -33,24 +38,27 @@ Before you start using the example, please make sure that: **3. 
Sample graph HiveOperator_test** - Provides an interactive terminal to query a Hive Metastore server and display the results. Note, the HiveOperator can only process one HiveQL statement at a time. +> The Python3 implementation follows the same steps; it differs only in the base image and operator tags. + ## How to run - - Import [solution/HiveOperator-1.0.tgz](solution/HiveOperator-1.0.tgz) via `SAP Data Hub System Management` -> `Files` -> `Import Solution` + - For python2, import [solution/py2/HiveOperator-1.0.1.tar.gz](solution/py2/HiveOperator-1.0.1.tar.gz) via `SAP Data Hub System Management` -> `Files` -> `Import Solution` + - For python3, import [solution/py3/HiveOperator-1.1.0.tar.gz](solution/py3/HiveOperator-1.1.0.tar.gz) via `SAP Data Hub System Management` -> `Files` -> `Import Solution` - Run the `Graph` -> `examples.HiveOperator_test` **Operator configuration parameters** - database: Specify which database in Hive metastore to connect to - delimiter: Used to separate columns in HiveOperator output e.g. 1.34;Hello;World; + database: Specify which database in Hive metastore to connect to + delimiter: Used to separate columns in HiveOperator output e.g. 
1.34;Hello;World; hive_hostname: Hostname or IP address to Hive Metastore server - hive_port: Port used by Hive Metastore server - http_mode: If hive.server2.transport.mode is set to http, set this parameter to true + hive_port: Port used by Hive Metastore server + http_mode: If hive.server2.transport.mode is set to http, set this parameter to true kerberos_enabled: If Hive cluster is kerberized set to true and read additional notes below kerberos_keytab_filename: The file name of the uploaded keytab file (case sensetive) kerberos_principal: Kerberos principal used with uploaded keytab file kerberos_realm: Kerberos realm used with principal and keytab file - username: Username for plain authentication - password: Password for plain authentication + username: Username for plain authentication + password: Password for plain authentication **Kerberos configuration** (Optional) Upload .keytab and krb5.conf file via the HiveOperator designer. These will be copied into the docker container at runtime. Remember to specify the kerberos realm and principal name in the operator's configuration section when designing your graph. 
diff --git a/HiveOperator/solution/HiveOperator-1.0.1.tar.gz b/HiveOperator/solution/py2/HiveOperator-1.0.1.tar.gz similarity index 100% rename from HiveOperator/solution/HiveOperator-1.0.1.tar.gz rename to HiveOperator/solution/py2/HiveOperator-1.0.1.tar.gz diff --git a/HiveOperator/solution/HiveOperator-1.0.tgz b/HiveOperator/solution/py2/HiveOperator-1.0.tgz similarity index 100% rename from HiveOperator/solution/HiveOperator-1.0.tgz rename to HiveOperator/solution/py2/HiveOperator-1.0.tgz diff --git a/HiveOperator/solution/py3/HiveOperator-1.1.0.tar.gz b/HiveOperator/solution/py3/HiveOperator-1.1.0.tar.gz new file mode 100644 index 0000000..efa9272 Binary files /dev/null and b/HiveOperator/solution/py3/HiveOperator-1.1.0.tar.gz differ diff --git a/HiveOperator/src/vrep/vflow/dockerfiles/examples/HiveOperator/Dockerfile b/HiveOperator/src/py2/vrep/vflow/dockerfiles/examples/HiveOperator/Dockerfile similarity index 100% rename from HiveOperator/src/vrep/vflow/dockerfiles/examples/HiveOperator/Dockerfile rename to HiveOperator/src/py2/vrep/vflow/dockerfiles/examples/HiveOperator/Dockerfile diff --git a/HiveOperator/src/vrep/vflow/dockerfiles/examples/HiveOperator/Tags.json b/HiveOperator/src/py2/vrep/vflow/dockerfiles/examples/HiveOperator/Tags.json similarity index 100% rename from HiveOperator/src/vrep/vflow/dockerfiles/examples/HiveOperator/Tags.json rename to HiveOperator/src/py2/vrep/vflow/dockerfiles/examples/HiveOperator/Tags.json diff --git a/HiveOperator/src/vrep/vflow/graphs/HiveOperator_test/graph.json b/HiveOperator/src/py2/vrep/vflow/graphs/HiveOperator_test/graph.json similarity index 100% rename from HiveOperator/src/vrep/vflow/graphs/HiveOperator_test/graph.json rename to HiveOperator/src/py2/vrep/vflow/graphs/HiveOperator_test/graph.json diff --git a/HiveOperator/src/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/Apache_Hive_logo.svg 
b/HiveOperator/src/py2/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/Apache_Hive_logo.svg similarity index 100% rename from HiveOperator/src/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/Apache_Hive_logo.svg rename to HiveOperator/src/py2/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/Apache_Hive_logo.svg diff --git a/HiveOperator/src/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/configSchema.json b/HiveOperator/src/py2/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/configSchema.json similarity index 100% rename from HiveOperator/src/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/configSchema.json rename to HiveOperator/src/py2/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/configSchema.json diff --git a/HiveOperator/src/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/operator.json b/HiveOperator/src/py2/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/operator.json similarity index 100% rename from HiveOperator/src/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/operator.json rename to HiveOperator/src/py2/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/operator.json diff --git a/HiveOperator/src/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/script.py b/HiveOperator/src/py2/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/script.py similarity index 100% rename from HiveOperator/src/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/script.py rename to HiveOperator/src/py2/vrep/vflow/subengines/com/sap/python27/operators/examples/HiveOperator/script.py diff --git a/HiveOperator/src/vsolution.json b/HiveOperator/src/py2/vsolution.json old mode 100755 new mode 100644 similarity index 100% rename from HiveOperator/src/vsolution.json rename to 
HiveOperator/src/py2/vsolution.json diff --git a/HiveOperator/src/py3/vrep/vflow/dockerfiles/examples/HiveOperator/Dockerfile b/HiveOperator/src/py3/vrep/vflow/dockerfiles/examples/HiveOperator/Dockerfile new file mode 100644 index 0000000..43d88d4 --- /dev/null +++ b/HiveOperator/src/py3/vrep/vflow/dockerfiles/examples/HiveOperator/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.6.4-slim-stretch + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt install -y python3-pip && \ + apt-get install -y python3-dev && \ + apt-get install -y krb5-user && \ + apt-get install -y libsasl2-dev && \ + apt-get install -y libsasl2-modules-gssapi-mit && \ + mkdir /keytabs + +# Install python libraries +RUN pip3 install pyhive[hive] +RUN pip3 install tornado==5.0.2 + + +# Add vflow user and vflow group to prevent error +# container has runAsNonRoot and image will run as root +RUN groupadd -g 1972 vflow && useradd -g 1972 -u 1972 -m vflow +USER 1972:1972 +WORKDIR /home/vflow +ENV HOME=/home/vflow \ No newline at end of file diff --git a/HiveOperator/src/py3/vrep/vflow/dockerfiles/examples/HiveOperator/Tags.json b/HiveOperator/src/py3/vrep/vflow/dockerfiles/examples/HiveOperator/Tags.json new file mode 100644 index 0000000..9470d57 --- /dev/null +++ b/HiveOperator/src/py3/vrep/vflow/dockerfiles/examples/HiveOperator/Tags.json @@ -0,0 +1,5 @@ +{ + "pyhive": "pip3", + "python36": "", + "tornado": "5.0.2" +} \ No newline at end of file diff --git a/HiveOperator/src/py3/vrep/vflow/graphs/HiveOperator_test/graph.json b/HiveOperator/src/py3/vrep/vflow/graphs/HiveOperator_test/graph.json new file mode 100644 index 0000000..c299c46 --- /dev/null +++ b/HiveOperator/src/py3/vrep/vflow/graphs/HiveOperator_test/graph.json @@ -0,0 +1,64 @@ +{ + "properties": {}, + "description": "Hive Operation", + "processes": { + "terminal1": { + "component": "com.sap.util.terminal", + "metadata": { + "label": "Terminal", + "x": 241.99999904632568, + "y": 39.99999952316284, + "height": 80, + 
"width": 120, + "ui": "dynpath", + "subengines": [ + "main" + ], + "config": {} + } + }, + "hiveoperator1": { + "component": "examples.HiveOperator", + "metadata": { + "label": "hiveOperator", + "x": 72.99999904632568, + "y": 39.99999952316284, + "height": 80, + "width": 120, + "extensible": true, + "config": {} + } + } + }, + "groups": [], + "connections": [ + { + "metadata": { + "points": "196.99999904632568,79.99999952316284 236.99999904632568,79.99999952316284" + }, + "src": { + "port": "output", + "process": "hiveoperator1" + }, + "tgt": { + "port": "in1", + "process": "terminal1" + } + }, + { + "metadata": { + "points": "365.9999990463257,79.99999952316284 393.9999985694885,79.99999952316284 393.9999985694885,12 12,12 12,79.99999952316284 67.99999904632568,79.99999952316284" + }, + "src": { + "port": "out1", + "process": "terminal1" + }, + "tgt": { + "port": "inSql", + "process": "hiveoperator1" + } + } + ], + "inports": {}, + "outports": {} +} \ No newline at end of file diff --git a/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/Apache_Hive_logo.svg b/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/Apache_Hive_logo.svg new file mode 100644 index 0000000..031ad38 --- /dev/null +++ b/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/Apache_Hive_logo.svg @@ -0,0 +1,51 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/configSchema.json b/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/configSchema.json new file mode 100644 index 0000000..0aec337 --- /dev/null +++ b/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/configSchema.json @@ -0,0 +1,97 @@ +{ + "$schema": 
"http://json-schema.org/draft-06/schema#", + "$id": "http://sap.com/vflow/examples.HiveOperator.configSchema.json", + "type": "object", + "properties": { + "codelanguage": { + "type": "string", + "sap_vflow_constraints": { + "ui_visibility": false + } + }, + "script": { + "type": "string", + "sap_vflow_constraints": { + "ui_visibility": false + } + }, + "delimiter": { + "title": "Delimiter", + "description": "Delimit Hive results using a special character", + "type": "string" + }, + "hive_hostname": { + "title": "Hive Hostname", + "type": "string" + }, + "http_mode": { + "title": "HTTP-mode enabled", + "type": "boolean" + }, + "port": { + "title": "Hive Port", + "type": "number" + }, + "username": { + "title": "Username", + "type": "string" + }, + "password": { + "title": "Password", + "type": "string" + }, + "database": { + "title": "Hive Database", + "type": "string" + }, + "kerberos_enabled": { + "title": "Kerberos enabled", + "type": "boolean" + }, + "kerberos_principal": { + "title": "Kerberos Principal", + "type": "string", + "sap_vflow_constraints": { + "ui_visibility": [ + { + "name": "kerberos_enabled", + "value": true, + "text": "kerberos_enabled=true" + } + ] + } + }, + "kerberos_realm": { + "type": "string", + "sap_vflow_constraints": { + "ui_visibility": [ + { + "name": "kerberos_enabled", + "value": true, + "text": "kerberos_enabled=true" + } + ] + } + }, + "kerberos_keytab_filename": { + "type": "string", + "sap_vflow_constraints": { + "ui_visibility": [ + { + "name": "kerberos_enabled", + "value": true, + "text": "kerberos_enabled=true" + } + ] + } + } + }, + "required": [ + "delimiter", + "hive_hostname", + "http_mode", + "port", + "database", + "kerberos_enabled" + ] +} \ No newline at end of file diff --git a/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/operator.json b/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/operator.json new file mode 100644 index 
0000000..60b1bc3
--- /dev/null
+++ b/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/operator.json
@@ -0,0 +1,37 @@
+{
+    "description": "hiveOperator",
+    "component": "com.sap.system.python3Operator",
+    "inports": [
+        {
+            "name": "inSql",
+            "type": "string"
+        }
+    ],
+    "outports": [
+        {
+            "name": "output",
+            "type": "string"
+        }
+    ],
+    "icon": "puzzle-piece",
+    "iconsrc": "Apache_Hive_logo.svg",
+    "config": {
+        "$type": "http://sap.com/vflow/examples.HiveOperator.configSchema.json",
+        "database": "default",
+        "delimiter": ";",
+        "hive_hostname": "fqdn.hive.com",
+        "http_mode": false,
+        "kerberos_enabled": false,
+        "kerberos_keytab_filename": "vora.keytab",
+        "kerberos_principal": "vora",
+        "kerberos_realm": "AD.HADOOP",
+        "port": 10000,
+        "script": "file://script.py",
+        "username": "hive"
+    },
+    "tags": {
+        "pyhive": "pip3",
+        "python36": "",
+        "tornado": "5.0.2"
+    }
+}
\ No newline at end of file
diff --git a/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/script.py b/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/script.py
new file mode 100644
index 0000000..70833f9
--- /dev/null
+++ b/HiveOperator/src/py3/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/script.py
@@ -0,0 +1,78 @@
+"""HiveOperator for the Python 3 subengine.
+
+Executes the HiveQL string received on the ``inSql`` port and sends the result
+rows to the ``output`` port as delimited text. ``api`` is not imported here; it
+is expected to be provided by the operator runtime.
+"""
+import base64
+from contextlib import closing
+from subprocess import call
+
+from pyhive import hive
+from thrift.transport.THttpClient import THttpClient
+
+# Directory the keytab and krb5.conf are copied from when Kerberos is enabled.
+OPERATOR_DIR = "/vrep/vflow/subengines/com/sap/python36/operators/examples/HiveOperator/"
+
+hostname = api.config.hive_hostname
+port = api.config.port
+user = api.config.username
+password = api.config.password
+database = api.config.database
+kerberos_enabled = api.config.kerberos_enabled
+kerberos_keytab = "/keytabs/" + api.config.kerberos_keytab_filename
+kerberos_principal = api.config.kerberos_principal + "@" + api.config.kerberos_realm
+http_enabled = api.config.http_mode
+
+if kerberos_enabled:
+    # Stage the keytab and krb5.conf, then obtain a ticket with kinit.
+    call(["cp", OPERATOR_DIR + api.config.kerberos_keytab_filename, "/keytabs"])
+    call(["cp", OPERATOR_DIR + "krb5.conf", "/etc"])
+    call(["/usr/bin/kinit", "-kt", kerberos_keytab, kerberos_principal])
+
+
+def on_input(inSql):
+    """Port callback: run the HiveQL string that arrived on ``inSql``."""
+    hiveconnection(inSql)
+
+
+def add_http_mode_support(username=user, password=password, port=port, httpPath="/cliservice", host=hostname, transportMode="http"):
+    """Return a Thrift HTTP transport with a Basic ``Authorization`` header.
+
+    ``transportMode`` is not used; it is kept so existing callers keep working.
+    """
+    credentials = "%s:%s" % (username, password)
+    # On Python 3 b64encode() takes and returns bytes, so encode the
+    # credentials and decode the token before joining it to the "Basic " str.
+    token = base64.b64encode(credentials.encode("utf-8")).decode("ascii")
+    _transport = THttpClient(host, port=port, path=httpPath)
+    _transport.setCustomHeaders({"Authorization": "Basic " + token})
+    return _transport
+
+
+def hiveconnection(inSql):
+    """Execute ``inSql`` on Hive and send the rows to the ``output`` port.
+
+    Each column value is followed by the configured delimiter and each row is
+    terminated by a newline.
+    """
+    if http_enabled:
+        # PyHive does not accept host/port/auth/password next to a custom
+        # transport, but the configured database can still be selected.
+        conn = hive.connect(thrift_transport=add_http_mode_support(), database=database)
+    elif kerberos_enabled:
+        conn = hive.Connection(host=hostname, port=port, username=user, password=None, database=database, auth="KERBEROS", kerberos_service_name="hive")
+    else:
+        conn = hive.Connection(host=hostname, port=port, username=user, password=api.config.password, database=database, auth="CUSTOM", kerberos_service_name=None)
+
+    # Release the cursor and the connection even if the statement fails.
+    with closing(conn), closing(conn.cursor()) as cur:
+        cur.execute(inSql)
+        rows = cur.fetchall()
+
+    delimiter = api.config.delimiter
+    result = "".join("".join(str(col) + delimiter for col in row) + "\n" for row in rows)
+    api.send("output", result)
+
+
+api.set_port_callback("inSql", on_input)
\ No newline at end of file
diff --git a/HiveOperator/src/py3/vsolution.json b/HiveOperator/src/py3/vsolution.json
new file mode 100644
index 0000000..f83b34a
--- /dev/null
+++ b/HiveOperator/src/py3/vsolution.json
@@ -0,0 +1 @@
+{"name":"HiveOperator","version":"1.1","AdditionalFields":{"manifest":{"name":"HiveOperator","version":"1.1"},"paths":["/vflow"]}}
\ No newline at end of file