"""CI helper for the key_file tests.

Exchanges the Google service-account key file ``credentials.json`` for a
short-lived access token and appends ``TOKEN`` and ``KEY_FILE_PATH`` to the
file named by ``$GITHUB_ENV`` so that later workflow steps can read them.
"""
import os
from google.oauth2 import service_account
from google.auth.transport.requests import Request


def get_token_from_user_file(user_file_path):
    """Return an access token for the service account described by ``user_file_path``.

    The credentials are requested with the Google Sheets scope and refreshed
    once, which performs the network round-trip that yields the token.
    """
    SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]

    credentials = service_account.Credentials.from_service_account_file(
        user_file_path,
        scopes=SCOPES,
    )

    request = Request()
    credentials.refresh(request)
    return credentials.token


key_file_path = "credentials.json"

# Fail with a clear message before doing any network work: open(None) would
# otherwise raise an unhelpful TypeError when run outside GitHub Actions.
env_file = os.getenv('GITHUB_ENV')
if not env_file:
    raise SystemExit("GITHUB_ENV is not set; this script must run inside a GitHub Actions job")

token = get_token_from_user_file(key_file_path)

# Ask the runner to redact the token wherever it would appear in job logs.
print(f"::add-mask::{token}")

with open(env_file, "a") as myfile:
    # Set the token as an env var for some tests
    myfile.write(f"TOKEN={token}\n")
    # Set the key_file filepath as an env var for other tests.
    # Every entry must end with a newline, otherwise the next append to
    # GITHUB_ENV is glued onto this line.
    myfile.write(f"KEY_FILE_PATH={key_file_path}\n")

print('It seems to have worked?')
diff --git a/src/gsheets_auth.cpp b/src/gsheets_auth.cpp index b1483b6..006f406 100644 --- a/src/gsheets_auth.cpp +++ b/src/gsheets_auth.cpp @@ -1,11 +1,15 @@ #include "gsheets_auth.hpp" #include "gsheets_requests.hpp" #include "gsheets_utils.hpp" +#include "gsheets_get_token.hpp" #include "duckdb/common/exception.hpp" #include "duckdb/main/secret/secret.hpp" #include "duckdb/main/extension_util.hpp" #include #include +#include + +using json = nlohmann::json; namespace duckdb { @@ -78,6 +82,42 @@ namespace duckdb return std::move(result); } + // TODO: Maybe this should be a KeyValueSecret + static unique_ptr CreateGsheetSecretFromKeyFile(ClientContext &context, CreateSecretInput &input) { + auto scope = input.scope; + + auto result = make_uniq(scope, input.type, input.provider, input.name); + + // Want to store the private key and email in case the secret is persisted + std::string filepath_key = "filepath"; + auto filepath = (input.options.find(filepath_key)->second).ToString(); + + std::ifstream ifs(filepath); + json credentials_file = json::parse(ifs); + std::string email = credentials_file["client_email"].get(); + std::string secret = credentials_file["private_key"].get(); + + // Manage specific secret option + (*result).secret_map["email"] = Value(email); + (*result).secret_map["secret"] = Value(secret); + CopySecret("filepath", input, *result); // Store the filepath anyway + + const auto result_const = *result; + TokenDetails token_details = get_token(context, &result_const); + std::string token = token_details.token; + + (*result).secret_map["token"] = Value(token); + (*result).secret_map["token_expiration"] = Value(token_details.expiration_time); + + // Redact sensible keys + RedactCommonKeys(*result); + result->redact_keys.insert("secret"); + result->redact_keys.insert("filepath"); + result->redact_keys.insert("token"); + + return std::move(result); + } + void CreateGsheetSecretFunctions::Register(DatabaseInstance &instance) { string type = "gsheet"; @@ 
-100,6 +140,12 @@ namespace duckdb oauth_function.named_parameters["use_oauth"] = LogicalType::BOOLEAN; RegisterCommonSecretParameters(oauth_function); ExtensionUtil::RegisterFunction(instance, oauth_function); + + // Register the key_file secret provider + CreateSecretFunction key_file_function = {type, "key_file", CreateGsheetSecretFromKeyFile}; + key_file_function.named_parameters["filepath"] = LogicalType::VARCHAR; + RegisterCommonSecretParameters(key_file_function); + ExtensionUtil::RegisterFunction(instance, key_file_function); } std::string InitiateOAuthFlow() diff --git a/src/gsheets_copy.cpp b/src/gsheets_copy.cpp index e952eee..784036e 100644 --- a/src/gsheets_copy.cpp +++ b/src/gsheets_copy.cpp @@ -7,6 +7,9 @@ #include "duckdb/common/file_system.hpp" #include "duckdb/main/secret/secret_manager.hpp" #include +#include +#include "gsheets_get_token.hpp" +#include using json = nlohmann::json; @@ -49,12 +52,22 @@ namespace duckdb throw InvalidInputException("Invalid secret format for 'gsheet' secret"); } - Value token_value; - if (!kv_secret->TryGetValue("token", token_value)) { - throw InvalidInputException("'token' not found in 'gsheet' secret"); - } + std::string token; + + if (secret.GetProvider() == "key_file") { + // If using a private key, retrieve the private key from the secret, but convert it + // into a token before use. This is an extra request per 30 minutes. 
namespace duckdb
{

    // Maps a 6-bit value (0-63) to its digit in the URL-safe base64 alphabet:
    // 'A'-'Z', 'a'-'z', '0'-'9', '-', '_'.
    //
    // Only the low six bits of `byte` are used, so every argument yields a valid digit.
    // Values 0-63 map exactly as before; out-of-range values no longer terminate the
    // process with exit(1), which is not acceptable inside a library.
    char get_base64_char(char byte) {
        static const char kAlphabet[] =
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
        return kAlphabet[static_cast<unsigned char>(byte) & 0x3f];
    }

    // Encodes `input_length` bytes from `input` as unpadded URL-safe base64 and writes the
    // NUL-terminated result to `output`.
    //
    // `output` must have room for (input_length * 4 + 2) / 3 characters plus the
    // terminating NUL. No '=' padding is emitted.
    //
    // Bytes beyond `input_length` are never read: a trailing partial group is completed
    // with zero bits, so the result depends only on the first `input_length` bytes.
    void base64encode(char *output, const char *input, size_t input_length) {
        size_t output_index = 0;
        for (size_t input_index = 0; input_index < input_length; input_index += 3) {
            const size_t remaining = input_length - input_index;
            const unsigned char b0 = static_cast<unsigned char>(input[input_index]);
            const unsigned char b1 = remaining > 1 ? static_cast<unsigned char>(input[input_index + 1]) : 0;
            const unsigned char b2 = remaining > 2 ? static_cast<unsigned char>(input[input_index + 2]) : 0;

            output[output_index++] = get_base64_char(static_cast<char>(b0 >> 2));
            output[output_index++] = get_base64_char(static_cast<char>(((b0 & 0x03) << 4) | (b1 >> 4)));
            if (remaining > 1) {
                output[output_index++] = get_base64_char(static_cast<char>(((b1 & 0x0f) << 2) | (b2 >> 6)));
            }
            if (remaining > 2) {
                output[output_index++] = get_base64_char(static_cast<char>(b2 & 0x3f));
            }
        }

        output[output_index] = '\0';
    }
}
header_64, claim_set_64); + + unsigned char *digest = SHA256((const unsigned char *)input, input_length, NULL); + char digest_str[1024]; + for (int i = 0; i < SHA256_DIGEST_LENGTH; ++i) { + sprintf(digest_str + i * 2, "%02x", digest[i]); + } + + digest_str[SHA256_DIGEST_LENGTH * 2] = '\0'; + + BIO* bio = BIO_new(BIO_s_mem()); + const void * private_key_pointer = secret_string.c_str(); + int private_key_length = std::strlen(secret_string.c_str()); + BIO_write(bio, private_key_pointer, private_key_length); + EVP_PKEY* evp_key = PEM_read_bio_PrivateKey(bio, NULL, NULL, NULL); + RSA* rsa = EVP_PKEY_get1_RSA(evp_key); + + if (rsa != NULL) { + unsigned char sigret[4096] = {}; + unsigned int siglen; + if (RSA_sign(NID_sha256, digest, SHA256_DIGEST_LENGTH, sigret, &siglen, rsa)) { + if (RSA_verify(NID_sha256, digest, SHA256_DIGEST_LENGTH, sigret, siglen, rsa)) { + char signature_64[1024]; + base64encode(signature_64, (const char *)sigret, siglen); + + char jwt[1024]; + sprintf(jwt, "%s.%s", input, signature_64); + + std::string body = "grant_type=urn:ietf:params:oauth:grant-type:jwt-bearer&assertion=" + std::string(jwt); + std::string response = perform_https_request("oauth2.googleapis.com", "/token", "", + HttpMethod::POST, + body, + "application/x-www-form-urlencoded"); + json response_json = parseJson(response); + std::string token = response_json["access_token"].get(); + TokenDetails result = {token, expiration_time}; + return result; + } else { + printf("Could not verify RSA signature."); + } + } else { + unsigned long err = ERR_get_error(); + printf("RSA_sign failed: %lu, %s\n", err, ERR_error_string(err, NULL)); + } + + RSA_free(rsa); + } + + throw InvalidInputException("Conversion from private key to token failed. Check email, key format in JSON file (-----BEGIN PRIVATE KEY-----\\n ... 
-----END PRIVATE KEY-----\\n), and expiration date."); + + } + + std::string get_token_and_cache(ClientContext &context, CatalogTransaction &transaction, const KeyValueSecret* kv_secret) { + + // Check if the token exists and has not expired. If so, use it + Value token_value; + Value token_expiration_value; + if (kv_secret->TryGetValue("token", token_value)) { + std::string token_string = token_value.ToString(); + + if (kv_secret->TryGetValue("token_expiration", token_expiration_value)) { + std::string token_expiration_string = token_expiration_value.ToString(); + // std::cout << "token_expiration_string: " << token_expiration_string << std::endl; + + std::time_t expiration_time = static_cast(std::stod(token_expiration_string)); + std::time_t current_time = std::time(NULL); + // std::cout << "expiration_time: " << expiration_time << " current_time: " << current_time << std::endl; + if (expiration_time > current_time) { + return token_string; + } + } + } + + // If we haven't returned yet, then there is no token or it is expired. + // std::cout << "Token does not exist or is expired!" 
<< std::endl; + TokenDetails token_details = get_token(context, kv_secret); + + // Cache the token in a new secret + auto &secret_manager = SecretManager::Get(context); + auto secret_name = kv_secret->GetName(); + auto old_secret = secret_manager.GetSecretByName(transaction, secret_name); + auto persist_type = old_secret->persist_type; + auto storage_mode = old_secret->storage_mode; + CreateSecretInfo create_secret_info = CreateSecretInfo(OnCreateConflict::REPLACE_ON_CONFLICT, persist_type); + + // Copy the old secret (to get metadata about the secret we want to maintain) + auto new_secret = old_secret->secret->Clone().get(); + auto new_secret_kv = dynamic_cast(*old_secret->secret); + + // Add in the new token and expiration date + case_insensitive_map_t new_options; + new_options["token"] = Value(token_details.token); + new_options["token_expiration"] = Value(token_details.expiration_time); + // std::cout << "About to set new expiration_time: " << token_details.expiration_time << std::endl; + + // Create a new secret based on the old secret metadata, but with new token and expiration + auto new_token_input = CreateSecretInput { + new_secret_kv.GetType(), + new_secret_kv.GetProvider(), + "", // I don't know what storage_type to put, and it shouldn't be needed + new_secret_kv.GetName(), + new_secret_kv.GetScope(), + new_options + }; + auto successfully_set_token = new_secret_kv.TrySetValue("token", new_token_input); + auto successfully_set_token_expiration = new_secret_kv.TrySetValue("token_expiration", new_token_input); + // std::cout << "successfully_set_token: " << successfully_set_token << std::endl; + // std::cout << "successfully_set_token_expiration: " << successfully_set_token_expiration << std::endl; + + // Then register the secret with the OnCreateConflict set to REPLACE_ON_CONFLICT + secret_manager.RegisterSecret( + transaction, + make_uniq(new_secret_kv), + OnCreateConflict::REPLACE_ON_CONFLICT, + persist_type, + storage_mode + ); + + // To make sure 
saving the secret worked end to end, return the token from the new secret + auto repulled_new_secret = secret_manager.GetSecretByName(transaction, secret_name); + auto repulled_new_secret_kv = dynamic_cast(*repulled_new_secret->secret); + Value repulled_token_value; + if (!repulled_new_secret_kv.TryGetValue("token", repulled_token_value)) { + throw InvalidInputException("'token' not found in repulled 'gsheet' secret after caching"); + } + std::string token_string = repulled_token_value.ToString(); + return token_string; + } +} diff --git a/src/gsheets_read.cpp b/src/gsheets_read.cpp index f7a0b66..1d72ebb 100644 --- a/src/gsheets_read.cpp +++ b/src/gsheets_read.cpp @@ -6,6 +6,8 @@ #include #include #include +#include "gsheets_get_token.hpp" +#include namespace duckdb { @@ -135,12 +137,22 @@ unique_ptr ReadSheetBind(ClientContext &context, TableFunctionBind throw InvalidInputException("Invalid secret format for 'gsheet' secret"); } - Value token_value; - if (!kv_secret->TryGetValue("token", token_value)) { - throw InvalidInputException("'token' not found in 'gsheet' secret"); - } + std::string token; + + if (secret.GetProvider() == "key_file") { + // If using a private key, retrieve the private key from the secret, but convert it + // into a token before use. This is an extra request per 30 minutes. 
+ // The secret is the JSON file that is extracted from Google as per the README + token = get_token_and_cache(context, transaction, kv_secret); - std::string token = token_value.ToString(); + } else { + Value token_value; + if (!kv_secret->TryGetValue("token", token_value)) { + throw InvalidInputException("'token' not found in 'gsheet' secret"); + } + + token = token_value.ToString(); + } // Parse named parameters for (auto &kv : input.named_parameters) { diff --git a/src/include/gsheets_get_token.hpp b/src/include/gsheets_get_token.hpp new file mode 100644 index 0000000..a1d92d5 --- /dev/null +++ b/src/include/gsheets_get_token.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include +#include +#include "duckdb/main/secret/secret_manager.hpp" + +namespace duckdb { + + char get_base64_char(char byte); + + void base64encode(char *output, const char *input, size_t input_length) ; + + struct TokenDetails { + std::string token; + std::string expiration_time; + }; + + TokenDetails get_token(ClientContext &context, const KeyValueSecret* kv_secret) ; + std::string get_token_and_cache(ClientContext &context, CatalogTransaction &transaction, const KeyValueSecret* kv_secret) ; + + +} \ No newline at end of file diff --git a/test/sql/copy_to_key_file.test b/test/sql/copy_to_key_file.test new file mode 100644 index 0000000..d8ba35f --- /dev/null +++ b/test/sql/copy_to_key_file.test @@ -0,0 +1,53 @@ +# name: test/sql/copy_to_key_file.test +# description: test use of key_file for auth to copy to gsheet +# group: [gsheets] + +# The key_file should be in the JSON format that Google exports +# (note that newline characters should not be escaped in the private key) +require-env KEY_FILE_PATH + +# Before we load the extension, this will fail +statement error +FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', header=true); +---- +Catalog Error: Table Function with name read_gsheet does not exist! 
+ +# Require statement will ensure this test is run with this extension loaded +require gsheets + +# Create a secret from the service-account key file at KEY_FILE_PATH (never commit the key file itself) +statement ok +create secret test_secret ( + type gsheet, + provider key_file, + filepath '${KEY_FILE_PATH}' +); + +# Create a table to copy to Google Sheet +statement ok +create table spreadsheets as +select 'Microsoft' as company, 'Excel' as product, 1985 as year_founded +union all +select 'Google', 'Google Sheets', 2006 +union all +select 'Apple', 'Numbers', 1984 +union all +select 'LibreOffice', 'Calc', 2000; + +# Copy the table to Google Sheet +statement ok +copy spreadsheets to 'https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1295634987#gid=1295634987' (format gsheet); + +# Read the table from Google Sheet +query III +from read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1295634987#gid=1295634987'); +---- +Microsoft Excel 1985 +Google Google Sheets 2006 +Apple Numbers 1984 +LibreOffice Calc 2000 + + +# Drop the secret +statement ok +drop secret test_secret; \ No newline at end of file diff --git a/test/sql/read_gsheet_key_file.test b/test/sql/read_gsheet_key_file.test new file mode 100644 index 0000000..36f7b8d --- /dev/null +++ b/test/sql/read_gsheet_key_file.test @@ -0,0 +1,35 @@ +# name: test/sql/read_gsheet_key_file.test +# description: test use of key_file for auth for read_gsheet() function +# group: [gsheets] + +# The key_file should be in the JSON format that Google exports +# (note that newline characters should not be escaped in the private key) +require-env KEY_FILE_PATH + +# Before we load the extension, this will fail +statement error +FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', header=true); +---- +Catalog Error: Table Function with name read_gsheet does not exist! 
+ +# Require statement will ensure this test is run with this extension loaded +require gsheets + +# Create a secret from the service-account key file at KEY_FILE_PATH (never commit the key file itself) +statement ok +create secret test_secret ( + type gsheet, + provider key_file, + filepath '${KEY_FILE_PATH}' +); + +# Confirm the key_file works +query II +FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=528658050#gid=528658050', header=true); +---- +woot 0 +woot 9001 + +# Drop the secret +statement ok +drop secret test_secret; \ No newline at end of file