From d40412fcb3ff5a506c4be4b4bc01aecb5de52a33 Mon Sep 17 00:00:00 2001
From: dianaxnav
Date: Thu, 18 Apr 2024 19:11:46 +0000
Subject: [PATCH 1/2] "Adding"

---
 week3/database.db.wal | 0
 week3/script.sql | 119 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 week3/database.db.wal
 create mode 100644 week3/script.sql

diff --git a/week3/database.db.wal b/week3/database.db.wal
new file mode 100644
index 0000000..e69de29
diff --git a/week3/script.sql b/week3/script.sql
new file mode 100644
index 0000000..16dcdfe
--- /dev/null
+++ b/week3/script.sql
@@ -0,0 +1,119 @@
+-- Scratch script of lecture queries; run statement-by-statement in the database CLI.
+-- Lines beginning with "." (.tables, .maxrow) are CLI dot-commands, not SQL.
+SELECT * FROM Species;
+.tables
+-- SQL is case-insensitive: this is the same query as the first one
+select * from species;
+-- limiting rows
+SELECT * FROM Species LIMIT 5;
+SELECT * FROM Species LIMIT 5 OFFSET 5;
+-- How many rows?
+SELECT COUNT(*) FROM Species;
+-- with a column name, COUNT() counts only the non-NULL values
+SELECT COUNT(Scientific_name) FROM Species;
+-- Which distinct values occur?
+SELECT DISTINCT Species FROM Bird_nests;
+
+-- Lecture 2
+.maxrow 6
+SELECT Location FROM Site;
+SELECT * FROM Site WHERE Area < 200;
+SELECT * FROM Site WHERE Area < 200 AND Location LIKE '%USA';
+
+-- expressions
+SELECT Site_name, Area FROM Site;
+SELECT Site_name, Area*2.47 FROM Site;
+SELECT Site_name, Area*2.47 AS Area_acres FROM Site;
+-- || concatenates strings
+SELECT Site_name || 'foo' FROM Site;
+
+-- aggregation functions
+SELECT COUNT(*) FROM Site;
+SELECT COUNT(*) AS num_rows FROM Site;
+SELECT Site_name, Area*2.47 AS Area_acres FROM Site;
+SELECT COUNT(Scientific_name) FROM Species;
+SELECT DISTINCT Relevance FROM Species;
+SELECT COUNT(DISTINCT Relevance) FROM Species;
+
+-- MIN, MAX, AVG
+SELECT AVG(Area) FROM Site;
+
+-- grouping
+SELECT * FROM Site;
+SELECT Location, MAX(Area)
+    FROM Site
+    GROUP BY Location;
+SELECT Location, COUNT(*)
+    FROM Site
+    GROUP BY Location;
+SELECT * FROM Species;
+SELECT Relevance, COUNT(*)
+    FROM Species
+    GROUP BY Relevance;
+SELECT Location, MAX(Area)
+    FROM Site
+    GROUP BY Location;
+-- WHERE filters rows before they are grouped
+SELECT Location, MAX(Area)
+    FROM Site
+    WHERE Location LIKE '%Canada'
+    GROUP BY Location;
+-- HAVING filters the groups; the aggregate is repeated because not every
+-- engine accepts a SELECT alias inside HAVING
+SELECT Location, MAX(Area) AS Max_area
+    FROM Site
+    WHERE Location LIKE '%Canada'
+    GROUP BY Location
+    HAVING MAX(Area) > 200;
+-- relational algebra peeks through!
+SELECT COUNT(*) FROM Site;
+SELECT COUNT(*) FROM (SELECT COUNT(*) FROM Site);
+SELECT * FROM Bird_nests LIMIT 3;
+SELECT COUNT(*) FROM Species;
+-- NOT IN matches nothing if the list contains a NULL, so NULLs are excluded
+SELECT * FROM Species
+    WHERE Code NOT IN (
+        SELECT DISTINCT Species FROM Bird_nests
+        WHERE Bird_nests.Species IS NOT NULL);
+
+-- saving queries
+CREATE TEMP TABLE t AS
+    SELECT * FROM Species
+    WHERE Code NOT IN (
+        SELECT DISTINCT Species FROM Bird_nests
+        WHERE Bird_nests.Species IS NOT NULL);
+-- call the table
+SELECT * FROM t;
+-- same result saved as a regular (non-TEMP) table, then cleaned up
+CREATE TABLE IF NOT EXISTS t_perm AS
+    SELECT * FROM t;
+SELECT * FROM t_perm;
+DROP TABLE IF EXISTS t_perm;
+
+-- NULL processing
+SELECT COUNT(*) FROM Bird_nests
+    WHERE floatAge <= 5;
+SELECT COUNT(*) FROM Bird_nests;
+-- INCORRECT on purpose: "= NULL" is never true, so this outputs 0
+SELECT COUNT(*) FROM Bird_nests WHERE floatAge = NULL;
+-- correct: specify IS NULL
+SELECT COUNT(*) FROM Bird_nests WHERE floatAge IS NULL;
+
+-- JOINS
+SELECT * FROM Camp_assignment;
+SELECT * FROM Personnel;
+SELECT * FROM Camp_assignment JOIN Personnel
+    ON Observer = Abbreviation;
+SELECT * FROM Camp_assignment JOIN Personnel
+    ON Camp_assignment.Observer = Personnel.Abbreviation;
+SELECT * FROM Camp_assignment AS ca JOIN Personnel p
+    ON ca.Observer = p.Abbreviation;
+SELECT * FROM Camp_assignment ca JOIN Personnel p
+    ON ca.Observer = p.Abbreviation
+    JOIN Site s
+    ON ca.Site = s.Code
+    WHERE ca.Observer = 'lmckinnon'
+    LIMIT 3;
+
+-- more on grouping
+SELECT Nest_ID, COUNT(*) FROM Bird_eggs GROUP BY Nest_ID;

From 5c38e0828cf4dc2984169afbe561ec120858d45e Mon Sep 17 00:00:00 2001
From: dianaxnav
Date: Wed, 8 May 2024 17:39:02 +0000
Subject: [PATCH 2/2] Try this

---
 .gitignore | 4 +
 bren-meds213-spring-2024-class-data.Rproj | 13 +
 .../pandas-checkpoint.ipynb | 254 +++++
 .../python_sql-checkpoint.ipynb | 912 ++++++++++++++++++
 week3/database.db | Bin 4730880 -> 6041600 bytes
 week3/database.db.wal | 0
 week3/pandas.ipynb | 254 +++++
week3/python_sql.ipynb | 912 ++++++++++++++++++ week6/dbplyr.qmd | 91 ++ 9 files changed, 2440 insertions(+) create mode 100644 .gitignore create mode 100644 bren-meds213-spring-2024-class-data.Rproj create mode 100644 week3/.ipynb_checkpoints/pandas-checkpoint.ipynb create mode 100644 week3/.ipynb_checkpoints/python_sql-checkpoint.ipynb delete mode 100644 week3/database.db.wal create mode 100644 week3/pandas.ipynb create mode 100644 week3/python_sql.ipynb create mode 100644 week6/dbplyr.qmd diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/bren-meds213-spring-2024-class-data.Rproj b/bren-meds213-spring-2024-class-data.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/bren-meds213-spring-2024-class-data.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/week3/.ipynb_checkpoints/pandas-checkpoint.ipynb b/week3/.ipynb_checkpoints/pandas-checkpoint.ipynb new file mode 100644 index 0000000..aaf6ca8 --- /dev/null +++ b/week3/.ipynb_checkpoints/pandas-checkpoint.ipynb @@ -0,0 +1,254 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "a74b4cb1-722b-4e09-b511-7cd3e5b4a4cc", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: duckdb in /Users/dmnavarro/.local/lib/python3.7/site-packages (0.10.2)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", 
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install duckdb" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "14680f2b-7b7e-4cc2-8515-6b258c30fc73", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import duckdb" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fee61f08-59ba-4570-a45d-f2869088687b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "conn = duckdb.connect(\"database.db\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cc5496dd-ad10-487b-a0fa-17b5e90d38c3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cur = conn.cursor()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "75bdd2f4-5216-453c-bddb-dad8b108ea3c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(2005,\n", + " 'bylo',\n", + " 'lmckinnon',\n", + " datetime.date(2005, 6, 1),\n", + " datetime.date(2005, 8, 5)),\n", + " (2005,\n", + " 'bylo',\n", + " 'blalibert',\n", + " datetime.date(2005, 6, 1),\n", + " datetime.date(2005, 8, 20)),\n", + " (2006,\n", + " 'bylo',\n", + " 'lmckinnon',\n", + " datetime.date(2006, 6, 1),\n", + " datetime.date(2006, 8, 5))]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Camp_assignment LIMIT 3\")\n", + "cur.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "064ae9ac-cca5-46db-bbd5-6f6cb4b225e3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.fetchall()" + ] + }, + { + "cell_type": 
"code", + "execution_count": 6, + "id": "029c7e65-ea39-4c59-a0b4-ec5e4c8405c7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cur.fetchone()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ebfbe6a2-046e-47a3-bd34-db2141b5a1a2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Observer dkessler gathered 69 nests\n", + "Observer bharrington gathered 245 nests\n", + "Observer lmckinnon gathered 249 nests\n", + "Observer dhodkinson gathered 15 nests\n", + "Observer bhill gathered 55 nests\n", + "Observer ssaalfeld gathered 13 nests\n", + "Observer wenglish gathered 18 nests\n", + "Observer lworing gathered 14 nests\n", + "Observer vloverti gathered 54 nests\n", + "Observer rlanctot gathered 40 nests\n", + "Observer abankert gathered 17 nests\n", + "Observer edastrous gathered 38 nests\n", + "Observer amould gathered 42 nests\n", + "Observer bkaselow gathered 4 nests\n", + "Observer jflamarre gathered 43 nests\n", + "Observer jzamuido gathered 11 nests\n", + "Observer mballvanzee gathered 2 nests\n", + "Observer mbwunder gathered 4 nests\n", + "Observer None gathered 0 nests\n", + "Observer kkalasz gathered 12 nests\n" + ] + } + ], + "source": [ + "inner_query = \"\"\"\n", + " SELECT COUNT(*) AS num_nests \n", + " FROM Bird_nests\n", + " WHERE Observer = ?\n", + "\"\"\"\n", + "\n", + "outer_query = \"\"\"\n", + " SELECT DISTINCT Observer FROM Bird_nests \n", + "\"\"\"\n", + "\n", + "for row in cur.execute(outer_query).fetchall():\n", + " observer = row[0]\n", + " cur2 = conn.cursor()\n", + " cur2.execute(inner_query, [observer])\n", + " print(f\"Observer {observer} gathered {cur2.fetchone()[0]} nests\")" + ] + }, + { + "cell_type": "markdown", + "id": "7fc40ac6-5763-4c8d-a4f1-5005e14adc8c", + "metadata": {}, + "source": [ + "Pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "50cd90c9-59ba-431d-9a16-8210b289e003", + "metadata": { + 
"tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e5f7d7b6-24a9-4b3a-9697-fceb0e267360", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.read_sql(\"SELECT * FROM Site\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c87eb5e-9c76-45e5-9503-9a80bcea69de", + "metadata": {}, + "outputs": [], + "source": [ + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week3/.ipynb_checkpoints/python_sql-checkpoint.ipynb b/week3/.ipynb_checkpoints/python_sql-checkpoint.ipynb new file mode 100644 index 0000000..cca3ec5 --- /dev/null +++ b/week3/.ipynb_checkpoints/python_sql-checkpoint.ipynb @@ -0,0 +1,912 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 43, + "id": "8ce928bf-26e0-4836-b5f5-b5a6cb48c1b6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: duckdb in /Users/dmnavarro/.local/lib/python3.7/site-packages (0.10.2)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to 
restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install duckdb" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "6026a2fa-f928-4f43-9d75-81439649c5e2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import duckdb" + ] + }, + { + "cell_type": "markdown", + "id": "5d888446-47ec-4828-902e-b3f55875af95", + "metadata": {}, + "source": [ + "Create a connection and a server" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "699fb802-81cf-438d-97cb-0c1244a1aea0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "conn = duckdb.connect(\"database.db\") " + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "a186b429-a944-4ecf-9514-e5848cd8520e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "9217dd9a-1a96-4bf0-8e77-09ed49f429f1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cur = conn.cursor()" + ] + }, + { + "cell_type": "markdown", + "id": "d62899fa-37fd-4b5f-9953-ded1b4f2b0c7", + "metadata": {}, + "source": [ + "Now let's do something with our cursor " + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "0f1ad8b3-4b45-4cc5-bc4e-38a71294f2f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Site LIMIT 5\")" + ] + }, + { + "cell_type": "markdown", + "id": "03d2cb6d-09ce-4c5f-ab45-85a4b6919434", + "metadata": {}, + "source": [ + "Now we want results..three ways of getting them:\n", + "1. 
All results at once" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "878b406e-2030-4bf5-ac98-475ff41c8fa4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('barr',\n", + " 'Barrow',\n", + " 'Alaska, USA',\n", + " 71.30000305175781,\n", + " -156.60000610351562,\n", + " 220.39999389648438),\n", + " ('burn',\n", + " 'Burntpoint Creek',\n", + " 'Ontario, Canada',\n", + " 55.20000076293945,\n", + " -84.30000305175781,\n", + " 63.0),\n", + " ('bylo',\n", + " 'Bylot Island',\n", + " 'Nunavut, Canada',\n", + " 73.19999694824219,\n", + " -80.0,\n", + " 723.5999755859375),\n", + " ('cakr',\n", + " 'Cape Krusenstern',\n", + " 'Alaska, USA',\n", + " 67.0999984741211,\n", + " -163.5,\n", + " 54.099998474121094),\n", + " ('cari',\n", + " 'Canning River Delta',\n", + " 'Alaska, USA',\n", + " 70.0999984741211,\n", + " -145.8000030517578,\n", + " 722.0)]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.fetchall()" + ] + }, + { + "cell_type": "markdown", + "id": "dd80f837-5824-4019-8857-10549b0160b7", + "metadata": { + "tags": [] + }, + "source": [ + "Cursors don't have to store anything, they just transfer queries to the database and get results back. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "fd4ad57d-0f83-4922-b30f-91068b6f3965", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "6980d7d7-3447-410b-a4b0-6f474230ead4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['14HPE1',\n", + " '11eaba',\n", + " '11eabaagc01',\n", + " '11eabaagv01',\n", + " '11eababbc02',\n", + " '11eababsv01',\n", + " '11eabaduh01',\n", + " '11eabaduv01',\n", + " '11eabarpc01',\n", + " '11eabarpc02']" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT Nest_ID FROM Bird_nests LIMIT 10\")\n", + "[t[0] for t in cur.fetchall()]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "abeb04f7-187b-4cf8-971d-33627357ac62", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(1547,)]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT COUNT(*) FROM Bird_nests\")\n", + "cur.fetchall() # tupple of one element" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "f838c66b-6085-413f-8b68-87b4837980ca", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(1547,)" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# more convenient to say\n", + "cur.execute(\"SELECT COUNT(*) FROM Bird_nests\")\n", + "cur.fetchone()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "89abaa3c-e176-4eae-9fcf-6221208861e6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1547" + ] + }, + 
"execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT COUNT(*) FROM Bird_nests\")\n", + "cur.fetchone()[0]" + ] + }, + { + "cell_type": "markdown", + "id": "7d2c30e4-23d1-4967-9c8d-c2aa7abf5b06", + "metadata": {}, + "source": [ + "3. using an iterator - but Duckdb doesn't support iterators :(" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "92c9e224-0850-4f8b-8f1d-40681a9bae4d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "got nest ID 14HPE1\n", + "got nest ID 11eaba\n", + "got nest ID 11eabaagc01\n", + "got nest ID 11eabaagv01\n", + "got nest ID 11eababbc02\n", + "got nest ID 11eababsv01\n", + "got nest ID 11eabaduh01\n", + "got nest ID 11eabaduv01\n", + "got nest ID 11eabarpc01\n", + "got nest ID 11eabarpc02\n" + ] + } + ], + "source": [ + "cur.execute(\"SELECT Nest_ID FROM Bird_nests LIMIT 10\") \n", + "while True: \n", + " row = cur.fetchone()\n", + " if row == None: \n", + " break \n", + " #do something with row \n", + " print(f\"got nest ID {row[0]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b6cc8397-e24d-497d-8094-7f2552d4081a", + "metadata": {}, + "source": [ + "Can do things other than SELECT! 
" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "469fa88d-be07-435e-ac6a-529a548d94ba", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"\"\"\n", + " CREATE TEMP TABLE temp_table AS \n", + " SELECT * FROM Bird_nests LIMIT 10 \n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "078d2c35-5f02-449b-a29c-5e9a10dd2835", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM temp_table\") " + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "71c8be9e-c9bb-44d4-9a61-fb211a0fcde1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('b14.6',\n", + " 2014,\n", + " 'chur',\n", + " '14HPE1',\n", + " 'sepl',\n", + " 'vloverti',\n", + " datetime.date(2014, 6, 14),\n", + " None,\n", + " 3,\n", + " None,\n", + " None),\n", + " ('b11.7',\n", + " 2011,\n", + " 'eaba',\n", + " '11eaba',\n", + " 'wrsa',\n", + " 'bhill',\n", + " datetime.date(2011, 7, 10),\n", + " 'searcher',\n", + " 4,\n", + " None,\n", + " None),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabaagc01',\n", + " 'amgp',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 24),\n", + " 'searcher',\n", + " 4,\n", + " 6.0,\n", + " 'float'),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabaagv01',\n", + " 'amgp',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 25),\n", + " 'searcher',\n", + " 3,\n", + " 3.0,\n", + " 'float'),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eababbc02',\n", + " 'bbpl',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 24),\n", + " 'searcher',\n", + " 4,\n", + " 4.0,\n", + " 'float'),\n", + " ('b11.7',\n", + " 
2011,\n", + " 'eaba',\n", + " '11eababsv01',\n", + " 'wrsa',\n", + " 'bhill',\n", + " datetime.date(2011, 7, 7),\n", + " 'searcher',\n", + " 4,\n", + " 2.0,\n", + " 'float'),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabaduh01',\n", + " 'dunl',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 28),\n", + " 'searcher',\n", + " 3,\n", + " 2.0,\n", + " 'float'),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabaduv01',\n", + " 'dunl',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 29),\n", + " 'searcher',\n", + " 4,\n", + " 5.0,\n", + " 'float'),\n", + " ('b11.7',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabarpc01',\n", + " 'reph',\n", + " 'bhill',\n", + " datetime.date(2011, 7, 8),\n", + " 'searcher',\n", + " 4,\n", + " 4.0,\n", + " 'float'),\n", + " ('b11.7',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabarpc02',\n", + " 'reph',\n", + " 'bhill',\n", + " datetime.date(2011, 7, 8),\n", + " 'searcher',\n", + " 3,\n", + " 4.0,\n", + " 'float')]" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.fetchall()" + ] + }, + { + "cell_type": "markdown", + "id": "bd254336-078a-4799-9193-f69815b1abb2", + "metadata": { + "tags": [] + }, + "source": [ + "A note on fragility\n", + "\n", + "For example: \n", + "```\n", + "INSERT INTO Site VALUES (\"abcd\", \"Foo\", 35.7, -119.5, \"?\") \n", + "```\n", + "\n", + "A less fragile way of expressing the same thing: \n", + "```\n", + "INSERT INTO Site (Code, Site_name, latitude, longitude, Something_else) \n", + " VALUES(\"abcd\", \"Foo\", 35.7, -119.5, \"?\")\n", + "```\n", + "\n", + "In the same vein: SELECT * is fragile: " + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "8fac3c49-de9c-4bb0-b7e9-853e26ac7eed", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('barr',\n", + " 'Barrow',\n", + " 'Alaska, USA',\n", + " 71.30000305175781,\n", + " -156.60000610351562,\n", + " 
220.39999389648438),\n", + " ('burn',\n", + " 'Burntpoint Creek',\n", + " 'Ontario, Canada',\n", + " 55.20000076293945,\n", + " -84.30000305175781,\n", + " 63.0),\n", + " ('bylo',\n", + " 'Bylot Island',\n", + " 'Nunavut, Canada',\n", + " 73.19999694824219,\n", + " -80.0,\n", + " 723.5999755859375)]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Site LIMIT 3\") \n", + "cur.fetchall()" + ] + }, + { + "cell_type": "markdown", + "id": "97b42950-3a0b-4004-9f8b-6b5535460fc6", + "metadata": { + "tags": [] + }, + "source": [ + "A better, more robust way of coding to the same thing:" + ] + }, + { + "cell_type": "markdown", + "id": "84a964cc-8338-4f36-9073-8d91209f9815", + "metadata": { + "tags": [] + }, + "source": [ + "cur.execute(\"SELECT Site_name, Code, Latitude, Longitude FROM Site LIMIT 3\")\n", + "cur.fetchall()" + ] + }, + { + "cell_type": "markdown", + "id": "e10f7259-eb15-40dc-9493-a1813e384464", + "metadata": {}, + "source": [ + "An extended example: Question we're trying to answer: How many nests do we have for each species?\n", + "\n", + "Approach: first get all species. 
Then execute a count query for each species:" + ] + }, + { + "cell_type": "markdown", + "id": "6bd90210-2457-455c-a320-b72de7799f64", + "metadata": { + "tags": [] + }, + "source": [ + "A digression: string interpolation in Python " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "21edde7e-20cb-45ad-b971-8118241bc2e9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My name is Greg\n" + ] + } + ], + "source": [ + "s= \"My name is %s\" \n", + "print(s % \"Greg\")" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "71cd6b6a-3a00-407e-b739-254ad09dfbe8", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Species agsq has 0 nests\n", + "Species amcr has 0 nests\n", + "Species amgp has 29 nests\n" + ] + } + ], + "source": [ + "query = \"\"\"\n", + " SELECT COUNT (*) FROM Bird_nests\n", + " WHERE Species = '%s'\n", + "\"\"\"\n", + "# for each species, stuff in code from species\n", + "# use our cursor\n", + "cur.execute(\"SELECT Code FROM Species LIMIT 3\")\n", + "for row in cur.fetchall(): # duckdb workaround\n", + " code = row[0]\n", + " prepared_query = query % code\n", + " #print(prepared_query)\n", + " cur2 = conn.cursor()\n", + " cur2.execute(prepared_query)\n", + " print(f\"Species {code} has {cur2.fetchone()[0]} nests\")\n", + " cur2.close()" + ] + }, + { + "cell_type": "markdown", + "id": "24b9c9af-8449-4cdc-b0a2-a4fc3defb8c7", + "metadata": {}, + "source": [ + "The above python interpolation is dangerous and has cuased many database hacks! 
There's a better way: " + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "b13af3aa-fbca-43bb-ac0f-02f2b5e69856", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Species agsq has 0 nests\n", + "Species amcr has 0 nests\n", + "Species amgp has 29 nests\n" + ] + } + ], + "source": [ + "query = \"\"\"\n", + " SELECT COUNT (*) FROM Bird_nests\n", + " WHERE Species = ?\n", + "\"\"\"\n", + "# for each species, stuff in code from species\n", + "# use our cursor\n", + "cur.execute(\"SELECT Code FROM Species LIMIT 3\")\n", + "for row in cur.fetchall(): # duckdb workaround\n", + " code = row[0]\n", + " # NOT NEEDED! prepared_query = query % code\n", + " #print(prepared_query)\n", + " cur2 = conn.cursor()\n", + " cur2.execute(query, [code]) # <--- added argument here \n", + " print(f\"Species {code} has {cur2.fetchone()[0]} nests\")\n", + " cur2.close()" + ] + }, + { + "cell_type": "markdown", + "id": "79a93a87-19d2-4b2a-9d6b-6019b1f96f9e", + "metadata": {}, + "source": [ + "Let's illustrate the danger with a different example" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "eb1953d6-7d42-42ea-8c36-ea43b9c8f91e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "abbrev = \"TS\" \n", + "name = \"Taylor Swift\" \n", + "cur.execute(\"\"\"\n", + " INSERT INTO Personnel (Abbreviation, Name) \n", + " VALUES ('%s', '%s') \n", + " \"\"\" % (abbrev, name) \n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "a22c6c67-7c32-4880-a9eb-e1178dfb5d98", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('emagnuson', 'Emily Magnuson'),\n", + " ('mcorrell', 'Maureen Correll'),\n", + " ('TS', 'Taylor Swift')]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Personnel\") \n", + "cur.fetchall()[-3:]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "35cf6b07-fa4e-451f-bc46-aacef0e36961", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "abbrev = \"CO\"\n", + "name = \"Conan O'Brian\"\n", + "cur.execute( \"\"\"\n", + " INSERT INTO Personnel (Abbreviation, Name) \n", + " VALUES(?,?)\n", + " \"\"\", \n", + " [abbrev, name])" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "78e38f60-a929-4bb4-b558-c720d9bad035", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('mcorrell', 'Maureen Correll'),\n", + " ('TS', 'Taylor Swift'),\n", + " ('CO', \"Conan O'Brian\")]" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Personnel\") \n", + "cur.fetchall()[-3:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a0ee988-511e-43a6-8cb9-58fcdf0eb336", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week3/database.db b/week3/database.db index ad492608ec1c07d330f753b7b9438a0667cd0bf2..753e61ec73d7d40a8b738a4f893fffe039dc314d 100644 GIT binary patch delta 5805 zcmeHLjZ;+D6~FhfyS#ou87Gp4`q7D%_}bi<+LqL`QQGlini;Ki1f5LO8I9Nvb<(NyyvN(X?sWPW 
zaA%f#?(d#^_MG22_we?$4R%XygF{lcn(mNGUWvF~(I=shp(dKkBrQF5z$vqsH=MIx z%Rlp$OoU8uH`hzZC~jKI+@+^#f{bWM1hq$bV0Yz+N&}Hs$E-I7KmgIK8pkSROOQh z7WLWGu1Y(%AIB-oSLL-D%36C>k;L{bo!C{G#?+v|_b4hmXxs}aURrgA&Lv7}ORLaA{W7gWW)u}dKG%rgL&t1g-#(ZBSt_r9Y z-nSL+X9G*(SZv*Utov*s%(+ZGv({L{EUF1!h5?zX{i%M69ZzMru3f;XNDL}B*ye|; zbd}9mo3!P8G@d5P2tI)!phULr9jn&cKFh&7O9~XG)a~MZvv4JxTHaNeq)iO`+QA#Z zA=sIxhnB6ovo8zp*X_NH`TCRi`G6X4KmpaQe-oVjLhJ6yk!mvh=pGd4XoLX#4JtAO z=uZd(PA7%|?WnVpy-`vQtQe0?lVo=CNX)_w^~f^nT_(8e82jA-^gF|V)3G6dcJ|tR zi)=+dQ~Q{ zDYq)D2~fdDwavG#T5y3@d3&1`7ipi~`w9oYV20NF09cqAa3$z9#mA+vhMbJpoEjl|m*dYB-cYf~0~Ytg zwXunu^?0$9uUUsv0csx!fZ)SWv)gX#p*)JvYj`@|mE%VTLaG9Qj*zMV;6XTW(=#?R zpz)XFFyId6pB@^E!Pbq166Go5ozc3@Sk?Z8;(=YUGMg-iA1e0$$Bxo-ki z!O$)*QCxL=i8Z#Lf1_1%7lIn(XKe z0N|$~HyD5)humNQXvhsN5RTKh*v5|m0eTzTZPmS_^L8`q?}Bi6p+~6RdJ5KsA8*ntcY#h85ii`rR~0;8lwHI* zQjkBP_e-7ZLP@7&g29WvmaW0QCmi44WGgsIfSGu<{sSi~?X!uOhVc*F8;ktta!BL= zoDGQ_fDb|<2jJb1Dg|I)NKpdtI1~^I+|sq^NNk5T(5o=@W<2}-*Y|ZlJAk**zmVdk zlvpUSQevYdk`kE`J0($+L{p+r5<`iD5+^0Gl(;B~qr^=~JS7Q~BvO(@iHDM8N>V79 zK*>Z(QYo24Ng5?8CFzu8P%@d4DU?j5WEv%zlw?toO-T+V(<#ZNWCkTODVaqHp(Kxz zd`f0hQb5TZO6F2hNJ$YT#gxpWq=XVLCG#m+KuIYjK1#|cSx8AaC5tFoOvy8pRK$q3 z-zY9h1W#G;Fsg)ljm)p2>8fpG@L z83g=>*1Dmg@%t^!$XUo=EmjgFJS6gntVZxbnrj!etCfF9 zmL&*9h*d8s2W5V2g{;`unH{ZyeY)5!JbvYz{=+ZneZ1;GwOSzGbjwPly)h}!AB zTk0A%$4KLb7z_dsa+`C4IqP`5DO+F;?&~iF!69Sla?nmT$iWwe`mCSPhi=D(5Z9t5 p7x$(~1^o3i=~+)uHTX{Psp%F~C8|aoQ7y6KTdrJt45^0%>EHU@XYi7Y zjR)d^{|`h-X_b_;Nm@c_mk#NaF6ovY$w*durBC`LCwUo=L5XBYhNU1QGAd&-E=4KH zgiOkmOv{YS%ACy0f-K6CEX#_l3bH2avLTzYCEKziyRs+yav+CtB*$_hr*bCeav_&; ICD(HE2L`UaeE \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install duckdb" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": 
"14680f2b-7b7e-4cc2-8515-6b258c30fc73", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import duckdb" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fee61f08-59ba-4570-a45d-f2869088687b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "conn = duckdb.connect(\"database.db\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cc5496dd-ad10-487b-a0fa-17b5e90d38c3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cur = conn.cursor()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "75bdd2f4-5216-453c-bddb-dad8b108ea3c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(2005,\n", + " 'bylo',\n", + " 'lmckinnon',\n", + " datetime.date(2005, 6, 1),\n", + " datetime.date(2005, 8, 5)),\n", + " (2005,\n", + " 'bylo',\n", + " 'blalibert',\n", + " datetime.date(2005, 6, 1),\n", + " datetime.date(2005, 8, 20)),\n", + " (2006,\n", + " 'bylo',\n", + " 'lmckinnon',\n", + " datetime.date(2006, 6, 1),\n", + " datetime.date(2006, 8, 5))]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Camp_assignment LIMIT 3\")\n", + "cur.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "064ae9ac-cca5-46db-bbd5-6f6cb4b225e3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "029c7e65-ea39-4c59-a0b4-ec5e4c8405c7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cur.fetchone()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ebfbe6a2-046e-47a3-bd34-db2141b5a1a2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Observer dkessler gathered 69 nests\n", + "Observer bharrington gathered 245 nests\n", + "Observer lmckinnon gathered 249 nests\n", + "Observer dhodkinson gathered 15 nests\n", + "Observer bhill gathered 55 nests\n", + "Observer ssaalfeld gathered 13 nests\n", + "Observer wenglish gathered 18 nests\n", + "Observer lworing gathered 14 nests\n", + "Observer vloverti gathered 54 nests\n", + "Observer rlanctot gathered 40 nests\n", + "Observer abankert gathered 17 nests\n", + "Observer edastrous gathered 38 nests\n", + "Observer amould gathered 42 nests\n", + "Observer bkaselow gathered 4 nests\n", + "Observer jflamarre gathered 43 nests\n", + "Observer jzamuido gathered 11 nests\n", + "Observer mballvanzee gathered 2 nests\n", + "Observer mbwunder gathered 4 nests\n", + "Observer None gathered 0 nests\n", + "Observer kkalasz gathered 12 nests\n" + ] + } + ], + "source": [ + "inner_query = \"\"\"\n", + " SELECT COUNT(*) AS num_nests \n", + " FROM Bird_nests\n", + " WHERE Observer = ?\n", + "\"\"\"\n", + "\n", + "outer_query = \"\"\"\n", + " SELECT DISTINCT Observer FROM Bird_nests \n", + "\"\"\"\n", + "\n", + "for row in cur.execute(outer_query).fetchall():\n", + " observer = row[0]\n", + " cur2 = conn.cursor()\n", + " cur2.execute(inner_query, [observer])\n", + " print(f\"Observer {observer} gathered {cur2.fetchone()[0]} nests\")" + ] + }, + { + "cell_type": "markdown", + "id": "7fc40ac6-5763-4c8d-a4f1-5005e14adc8c", + "metadata": {}, + "source": [ + "Pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "50cd90c9-59ba-431d-9a16-8210b289e003", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e5f7d7b6-24a9-4b3a-9697-fceb0e267360", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.read_sql(\"SELECT * FROM Site\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"7c87eb5e-9c76-45e5-9503-9a80bcea69de", + "metadata": {}, + "outputs": [], + "source": [ + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week3/python_sql.ipynb b/week3/python_sql.ipynb new file mode 100644 index 0000000..cca3ec5 --- /dev/null +++ b/week3/python_sql.ipynb @@ -0,0 +1,912 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 43, + "id": "8ce928bf-26e0-4836-b5f5-b5a6cb48c1b6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: duckdb in /Users/dmnavarro/.local/lib/python3.7/site-packages (0.10.2)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install duckdb" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "6026a2fa-f928-4f43-9d75-81439649c5e2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import duckdb" + ] + }, + { + "cell_type": "markdown", + "id": "5d888446-47ec-4828-902e-b3f55875af95", + "metadata": {}, + "source": [ + "Create a connection and a server" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 45, + "id": "699fb802-81cf-438d-97cb-0c1244a1aea0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "conn = duckdb.connect(\"database.db\") " + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "a186b429-a944-4ecf-9514-e5848cd8520e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "9217dd9a-1a96-4bf0-8e77-09ed49f429f1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cur = conn.cursor()" + ] + }, + { + "cell_type": "markdown", + "id": "d62899fa-37fd-4b5f-9953-ded1b4f2b0c7", + "metadata": {}, + "source": [ + "Now let's do something with our cursor " + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "0f1ad8b3-4b45-4cc5-bc4e-38a71294f2f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Site LIMIT 5\")" + ] + }, + { + "cell_type": "markdown", + "id": "03d2cb6d-09ce-4c5f-ab45-85a4b6919434", + "metadata": {}, + "source": [ + "Now we want results..three ways of getting them:\n", + "1. 
All results at once" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "878b406e-2030-4bf5-ac98-475ff41c8fa4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('barr',\n", + " 'Barrow',\n", + " 'Alaska, USA',\n", + " 71.30000305175781,\n", + " -156.60000610351562,\n", + " 220.39999389648438),\n", + " ('burn',\n", + " 'Burntpoint Creek',\n", + " 'Ontario, Canada',\n", + " 55.20000076293945,\n", + " -84.30000305175781,\n", + " 63.0),\n", + " ('bylo',\n", + " 'Bylot Island',\n", + " 'Nunavut, Canada',\n", + " 73.19999694824219,\n", + " -80.0,\n", + " 723.5999755859375),\n", + " ('cakr',\n", + " 'Cape Krusenstern',\n", + " 'Alaska, USA',\n", + " 67.0999984741211,\n", + " -163.5,\n", + " 54.099998474121094),\n", + " ('cari',\n", + " 'Canning River Delta',\n", + " 'Alaska, USA',\n", + " 70.0999984741211,\n", + " -145.8000030517578,\n", + " 722.0)]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.fetchall()" + ] + }, + { + "cell_type": "markdown", + "id": "dd80f837-5824-4019-8857-10549b0160b7", + "metadata": { + "tags": [] + }, + "source": [ + "Cursors don't have to store anything; they just transfer queries to the database and get results back. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "fd4ad57d-0f83-4922-b30f-91068b6f3965", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "6980d7d7-3447-410b-a4b0-6f474230ead4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['14HPE1',\n", + " '11eaba',\n", + " '11eabaagc01',\n", + " '11eabaagv01',\n", + " '11eababbc02',\n", + " '11eababsv01',\n", + " '11eabaduh01',\n", + " '11eabaduv01',\n", + " '11eabarpc01',\n", + " '11eabarpc02']" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT Nest_ID FROM Bird_nests LIMIT 10\")\n", + "[t[0] for t in cur.fetchall()]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "abeb04f7-187b-4cf8-971d-33627357ac62", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(1547,)]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT COUNT(*) FROM Bird_nests\")\n", + "cur.fetchall() # tuple of one element" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "f838c66b-6085-413f-8b68-87b4837980ca", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(1547,)" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# more convenient to say\n", + "cur.execute(\"SELECT COUNT(*) FROM Bird_nests\")\n", + "cur.fetchone()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "89abaa3c-e176-4eae-9fcf-6221208861e6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1547" + ] + }, + 
"execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT COUNT(*) FROM Bird_nests\")\n", + "cur.fetchone()[0]" + ] + }, + { + "cell_type": "markdown", + "id": "7d2c30e4-23d1-4967-9c8d-c2aa7abf5b06", + "metadata": {}, + "source": [ + "3. using an iterator - but Duckdb doesn't support iterators :(" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "92c9e224-0850-4f8b-8f1d-40681a9bae4d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "got nest ID 14HPE1\n", + "got nest ID 11eaba\n", + "got nest ID 11eabaagc01\n", + "got nest ID 11eabaagv01\n", + "got nest ID 11eababbc02\n", + "got nest ID 11eababsv01\n", + "got nest ID 11eabaduh01\n", + "got nest ID 11eabaduv01\n", + "got nest ID 11eabarpc01\n", + "got nest ID 11eabarpc02\n" + ] + } + ], + "source": [ + "cur.execute(\"SELECT Nest_ID FROM Bird_nests LIMIT 10\") \n", + "while True: \n", + " row = cur.fetchone()\n", + " if row == None: \n", + " break \n", + " #do something with row \n", + " print(f\"got nest ID {row[0]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b6cc8397-e24d-497d-8094-7f2552d4081a", + "metadata": {}, + "source": [ + "Can do things other than SELECT! 
" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "469fa88d-be07-435e-ac6a-529a548d94ba", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"\"\"\n", + " CREATE TEMP TABLE temp_table AS \n", + " SELECT * FROM Bird_nests LIMIT 10 \n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "078d2c35-5f02-449b-a29c-5e9a10dd2835", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM temp_table\") " + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "71c8be9e-c9bb-44d4-9a61-fb211a0fcde1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('b14.6',\n", + " 2014,\n", + " 'chur',\n", + " '14HPE1',\n", + " 'sepl',\n", + " 'vloverti',\n", + " datetime.date(2014, 6, 14),\n", + " None,\n", + " 3,\n", + " None,\n", + " None),\n", + " ('b11.7',\n", + " 2011,\n", + " 'eaba',\n", + " '11eaba',\n", + " 'wrsa',\n", + " 'bhill',\n", + " datetime.date(2011, 7, 10),\n", + " 'searcher',\n", + " 4,\n", + " None,\n", + " None),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabaagc01',\n", + " 'amgp',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 24),\n", + " 'searcher',\n", + " 4,\n", + " 6.0,\n", + " 'float'),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabaagv01',\n", + " 'amgp',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 25),\n", + " 'searcher',\n", + " 3,\n", + " 3.0,\n", + " 'float'),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eababbc02',\n", + " 'bbpl',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 24),\n", + " 'searcher',\n", + " 4,\n", + " 4.0,\n", + " 'float'),\n", + " ('b11.7',\n", + " 
2011,\n", + " 'eaba',\n", + " '11eababsv01',\n", + " 'wrsa',\n", + " 'bhill',\n", + " datetime.date(2011, 7, 7),\n", + " 'searcher',\n", + " 4,\n", + " 2.0,\n", + " 'float'),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabaduh01',\n", + " 'dunl',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 28),\n", + " 'searcher',\n", + " 3,\n", + " 2.0,\n", + " 'float'),\n", + " ('b11.6',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabaduv01',\n", + " 'dunl',\n", + " 'dkessler',\n", + " datetime.date(2011, 6, 29),\n", + " 'searcher',\n", + " 4,\n", + " 5.0,\n", + " 'float'),\n", + " ('b11.7',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabarpc01',\n", + " 'reph',\n", + " 'bhill',\n", + " datetime.date(2011, 7, 8),\n", + " 'searcher',\n", + " 4,\n", + " 4.0,\n", + " 'float'),\n", + " ('b11.7',\n", + " 2011,\n", + " 'eaba',\n", + " '11eabarpc02',\n", + " 'reph',\n", + " 'bhill',\n", + " datetime.date(2011, 7, 8),\n", + " 'searcher',\n", + " 3,\n", + " 4.0,\n", + " 'float')]" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.fetchall()" + ] + }, + { + "cell_type": "markdown", + "id": "bd254336-078a-4799-9193-f69815b1abb2", + "metadata": { + "tags": [] + }, + "source": [ + "A note on fragility\n", + "\n", + "For example: \n", + "```\n", + "INSERT INTO Site VALUES (\"abcd\", \"Foo\", 35.7, -119.5, \"?\") \n", + "```\n", + "\n", + "A less fragile way of expressing the same thing: \n", + "```\n", + "INSERT INTO Site (Code, Site_name, latitude, longitude, Something_else) \n", + " VALUES(\"abcd\", \"Foo\", 35.7, -119.5, \"?\")\n", + "```\n", + "\n", + "In the same vein: SELECT * is fragile: " + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "8fac3c49-de9c-4bb0-b7e9-853e26ac7eed", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('barr',\n", + " 'Barrow',\n", + " 'Alaska, USA',\n", + " 71.30000305175781,\n", + " -156.60000610351562,\n", + " 
220.39999389648438),\n", + " ('burn',\n", + " 'Burntpoint Creek',\n", + " 'Ontario, Canada',\n", + " 55.20000076293945,\n", + " -84.30000305175781,\n", + " 63.0),\n", + " ('bylo',\n", + " 'Bylot Island',\n", + " 'Nunavut, Canada',\n", + " 73.19999694824219,\n", + " -80.0,\n", + " 723.5999755859375)]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Site LIMIT 3\") \n", + "cur.fetchall()" + ] + }, + { + "cell_type": "markdown", + "id": "97b42950-3a0b-4004-9f8b-6b5535460fc6", + "metadata": { + "tags": [] + }, + "source": [ + "A better, more robust way of coding the same thing:" + ] + }, + { + "cell_type": "markdown", + "id": "84a964cc-8338-4f36-9073-8d91209f9815", + "metadata": { + "tags": [] + }, + "source": [ + "cur.execute(\"SELECT Site_name, Code, Latitude, Longitude FROM Site LIMIT 3\")\n", + "cur.fetchall()" + ] + }, + { + "cell_type": "markdown", + "id": "e10f7259-eb15-40dc-9493-a1813e384464", + "metadata": {}, + "source": [ + "An extended example: Question we're trying to answer: How many nests do we have for each species?\n", + "\n", + "Approach: first get all species. 
Then execute a count query for each species:" + ] + }, + { + "cell_type": "markdown", + "id": "6bd90210-2457-455c-a320-b72de7799f64", + "metadata": { + "tags": [] + }, + "source": [ + "A digression: string interpolation in Python " + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "21edde7e-20cb-45ad-b971-8118241bc2e9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My name is Greg\n" + ] + } + ], + "source": [ + "s= \"My name is %s\" \n", + "print(s % \"Greg\")" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "71cd6b6a-3a00-407e-b739-254ad09dfbe8", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Species agsq has 0 nests\n", + "Species amcr has 0 nests\n", + "Species amgp has 29 nests\n" + ] + } + ], + "source": [ + "query = \"\"\"\n", + " SELECT COUNT (*) FROM Bird_nests\n", + " WHERE Species = '%s'\n", + "\"\"\"\n", + "# for each species, stuff in code from species\n", + "# use our cursor\n", + "cur.execute(\"SELECT Code FROM Species LIMIT 3\")\n", + "for row in cur.fetchall(): # duckdb workaround\n", + " code = row[0]\n", + " prepared_query = query % code\n", + " #print(prepared_query)\n", + " cur2 = conn.cursor()\n", + " cur2.execute(prepared_query)\n", + " print(f\"Species {code} has {cur2.fetchone()[0]} nests\")\n", + " cur2.close()" + ] + }, + { + "cell_type": "markdown", + "id": "24b9c9af-8449-4cdc-b0a2-a4fc3defb8c7", + "metadata": {}, + "source": [ + "The above Python string interpolation is dangerous and has caused many database hacks! 
There's a better way: " + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "b13af3aa-fbca-43bb-ac0f-02f2b5e69856", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Species agsq has 0 nests\n", + "Species amcr has 0 nests\n", + "Species amgp has 29 nests\n" + ] + } + ], + "source": [ + "query = \"\"\"\n", + " SELECT COUNT (*) FROM Bird_nests\n", + " WHERE Species = ?\n", + "\"\"\"\n", + "# for each species, stuff in code from species\n", + "# use our cursor\n", + "cur.execute(\"SELECT Code FROM Species LIMIT 3\")\n", + "for row in cur.fetchall(): # duckdb workaround\n", + " code = row[0]\n", + " # NOT NEEDED! prepared_query = query % code\n", + " #print(prepared_query)\n", + " cur2 = conn.cursor()\n", + " cur2.execute(query, [code]) # <--- added argument here \n", + " print(f\"Species {code} has {cur2.fetchone()[0]} nests\")\n", + " cur2.close()" + ] + }, + { + "cell_type": "markdown", + "id": "79a93a87-19d2-4b2a-9d6b-6019b1f96f9e", + "metadata": {}, + "source": [ + "Let's illustrate the danger with a different example" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "eb1953d6-7d42-42ea-8c36-ea43b9c8f91e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "abbrev = \"TS\" \n", + "name = \"Taylor Swift\" \n", + "cur.execute(\"\"\"\n", + " INSERT INTO Personnel (Abbreviation, Name) \n", + " VALUES ('%s', '%s') \n", + " \"\"\" % (abbrev, name) \n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "a22c6c67-7c32-4880-a9eb-e1178dfb5d98", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('emagnuson', 'Emily Magnuson'),\n", + " ('mcorrell', 'Maureen Correll'),\n", + " ('TS', 'Taylor Swift')]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Personnel\") \n", + "cur.fetchall()[-3:]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "35cf6b07-fa4e-451f-bc46-aacef0e36961", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "abbrev = \"CO\"\n", + "name = \"Conan O'Brian\"\n", + "cur.execute( \"\"\"\n", + " INSERT INTO Personnel (Abbreviation, Name) \n", + " VALUES(?,?)\n", + " \"\"\", \n", + " [abbrev, name])" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "78e38f60-a929-4bb4-b558-c720d9bad035", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('mcorrell', 'Maureen Correll'),\n", + " ('TS', 'Taylor Swift'),\n", + " ('CO', \"Conan O'Brian\")]" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute(\"SELECT * FROM Personnel\") \n", + "cur.fetchall()[-3:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a0ee988-511e-43a6-8cb9-58fcdf0eb336", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week6/dbplyr.qmd b/week6/dbplyr.qmd new file mode 100644 index 0000000..a9e7b50 --- /dev/null +++ b/week6/dbplyr.qmd @@ -0,0 +1,91 @@ +# DuckDB through R + +## Load in libraries +```{r, warning = F} +# Let's load all the libraries +library(tidyverse) +library(DBI) +library(dbplyr) +library(duckdb) +``` 
+ + +## Connect to database + +```{r} +conn <- DBI::dbConnect(duckdb::duckdb(), dbdir = "../week3/database.db") +``` + +## Check out +```{r} +DBI::dbListTables(conn) +``` + +```{r} +species <- tbl(conn, "Species") +species +``` + +SQL Query + +```{r} +df <- dbGetQuery(conn, "SELECT * FROM Species WHERE Code LIKE 'a%'") +df +``` + +## Working with 'dbplyr' + +This is read-only +```{r} +species %>% + filter(Relevance == "Study species") %>% + select(Code, Scientific_name) %>% + show_query() + +``` + +```{r} +species %>% + filter(Relevance == "Study species") %>% + select(Code, Scientific_name) %>% + collect() + +``` +Group by Relevance and count the number of species in each group: + +```{r} +species %>% + group_by(Relevance) %>% + summarize(num_species = n()) %>% + arrange(-num_species) %>% + collect() +``` +We can even mutate! + +We want to add an "X" in front of the Codes and update the column using a mutate: + +```{r} +new_species_code <- species %>% + mutate(Code = paste0("X", Code)) %>% + collect() + +``` + +We can do joins through it as well! + +```{r} +eggs_db <- tbl(conn, "Bird_eggs") # quote because you need the DBI package +nests_db <- tbl(conn, "Bird_nests") +``` + +We want to left join the nests and the eggs tables: + +```{r} +left_join(eggs_db, nests_db, by = c("Nest_ID", "Year", "Site", "Book_page")) %>% + show_query() +``` + +```{r} +DBI::dbDisconnect(conn, shutdown = TRUE) +``` +