From 40e9b94d78b6cb614aa8e7dc3b094e8652ab6ee0 Mon Sep 17 00:00:00 2001 From: ncclementi Date: Mon, 10 Jun 2024 13:08:02 -0400 Subject: [PATCH 1/5] refactor: remove ibis.NA --- .../index/execute-results/html.json | 9 +++-- docs/_quarto.yml | 4 -- docs/posts/campaign-finance/index.qmd | 2 +- docs/tutorials/ibis-for-pandas-users.qmd | 2 +- docs/tutorials/ibis-for-sql-users.qmd | 4 +- ibis/__init__.py | 21 +++++++++- .../clickhouse/tests/test_functions.py | 10 ++--- ibis/backends/dask/tests/test_window.py | 10 ++--- ibis/backends/impala/tests/test_case_exprs.py | 2 +- ibis/backends/impala/tests/test_exprs.py | 8 ++-- ibis/backends/pandas/tests/test_window.py | 10 ++--- .../backends/postgres/tests/test_functions.py | 22 +++++------ .../risingwave/tests/test_functions.py | 18 ++++----- ibis/backends/sqlite/tests/test_client.py | 2 +- ibis/backends/tests/sql/test_sql.py | 2 +- ibis/backends/tests/test_generic.py | 19 ++++++---- ibis/backends/tests/test_map.py | 4 +- ibis/backends/tests/test_string.py | 2 +- ibis/backends/tests/test_struct.py | 2 +- ibis/backends/tests/test_window.py | 2 +- ibis/expr/api.py | 30 --------------- ibis/tests/expr/test_table.py | 2 +- ibis/tests/expr/test_timestamp.py | 2 +- ibis/tests/expr/test_value_exprs.py | 38 +++++++++---------- ibis/tests/expr/test_window_frames.py | 4 +- 25 files changed, 110 insertions(+), 121 deletions(-) diff --git a/docs/_freeze/posts/campaign-finance/index/execute-results/html.json b/docs/_freeze/posts/campaign-finance/index/execute-results/html.json index 4da314f908f0..4f436cb38c6f 100644 --- a/docs/_freeze/posts/campaign-finance/index/execute-results/html.json +++ b/docs/_freeze/posts/campaign-finance/index/execute-results/html.json @@ -1,14 +1,15 @@ { - "hash": "2631514785c59e4e1d3b37b9c07ea232", + "hash": "989ed0f2ebddb8e202db6a33bc1bf790", "result": { - "markdown": "---\ntitle: \"Exploring campaign finance data\"\nauthor: \"Nick Crews\"\ndate: \"2023-03-24\"\ncategories:\n - blog\n - data engineering\n - case 
study\n - duckdb\n - performance\n---\n\nHi! My name is [Nick Crews](https://www.linkedin.com/in/nicholas-b-crews/),\nand I'm a data engineer that looks at public campaign finance data.\n\nIn this post, I'll walk through how I use Ibis to explore public campaign contribution\ndata from the Federal Election Commission (FEC). We'll do some loading,\ncleaning, featurizing, and visualization. There will be filtering, sorting, grouping,\nand aggregation.\n\n## Downloading The Data\n\n::: {#02d63441 .cell execution_count=1}\n``` {.python .cell-code}\nfrom pathlib import Path\nfrom zipfile import ZipFile\nfrom urllib.request import urlretrieve\n\n# Download and unzip the 2018 individual contributions data\nurl = \"https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/2018/indiv18.zip\"\nzip_path = Path(\"indiv18.zip\")\ncsv_path = Path(\"indiv18.csv\")\n\nif not zip_path.exists():\n urlretrieve(url, zip_path)\n\nif not csv_path.exists():\n with ZipFile(zip_path) as zip_file, csv_path.open(\"w\") as csv_file:\n for line in zip_file.open(\"itcont.txt\"):\n csv_file.write(line.decode())\n```\n:::\n\n\n## Loading the data\n\nNow that we have our raw data in a .csv format, let's load it into Ibis,\nusing the duckdb backend.\n\nNote that a 4.3 GB .csv would be near the limit of what pandas could\nhandle on my laptop with 16GB of RAM. In pandas, typically every time\nyou perform a transformation on the data, a copy of the data is made.\nI could only do a few transformations before I ran out of memory.\n\nWith Ibis, this problem is solved in two different ways.\n\nFirst, because they are designed to work with very large datasets,\nmany (all?) SQL backends support out of core operations.\nThe data lives on disk, and are only loaded in a streaming fashion\nwhen needed, and then written back to disk as the operation is performed.\n\nSecond, unless you explicitly ask for it, Ibis makes use of lazy\nevaluation. 
This means that when you ask for a result, the\nresult is not persisted in memory. Only the original source\ndata is persisted. Everything else is derived from this on the fly.\n\n::: {#83a871f2 .cell execution_count=2}\n``` {.python .cell-code}\nimport ibis\nfrom ibis import _\n\nibis.options.interactive = True\n\n# The raw .csv file doesn't have column names, so we will add them in the next step.\nraw = ibis.read_csv(csv_path)\nraw\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ C00401224  A       M6      P       201804059101866001  24T     IND     STOUFFER, LEIGH    AMSTELVEEN    ZZ      1187RC     MYSELF             SELF EMPLOYED            05172017  10     C00458000  SA11AI_81445687  1217152  column18  EARMARKED FOR PROGRESSIVE CHANGE CAMPAIGN COMMITTEE (C00458000)  4050820181544765358 ┃\n┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringstringstringint64stringstringstringstringstringstringstringstringstringint64stringstringint64stringstringint64               │\n├───────────┼────────┼────────┼────────┼────────────────────┼────────┼────────┼───────────────────┼──────────────┼────────┼───────────┼───────────────────┼─────────────────────────┼──────────┼───────┼───────────┼─────────────────┼─────────┼──────────┼─────────────────────────────────────────────────────────────────┼─────────────────────┤\n│ C00401224A     M6    P     20180405910186774824T   IND   STRAWS, JOYCE    OCOEE       FL    34761    SILVERSEA CRUISESRESERVATIONS SUPERVISOR0518201710C00000935SA11AI_815923361217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544770597 │\n│ C00401224A     M6    P     20180405910186774824T   IND   STRAWS, JOYCE    OCOEE       FL    34761    SILVERSEA CRUISESRESERVATIONS SUPERVISOR0519201715C00000935SA11AI_816275621217152NULLEARMARKED FOR DCCC (C00000935)                      
           4050820181544770598 │\n│ C00401224A     M6    P     20180405910186594224T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   0513201735C00000935SA11AI_810479211217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765179 │\n│ C00401224A     M6    P     20180405910186594224T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   0515201735C00000935SA11AI_812092091217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765180 │\n│ C00401224A     M6    P     20180405910186594224T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   051920175C00000935SA11AI_816052231217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765181 │\n│ C00401224A     M6    P     20180405910186594324T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   0524201715C00000935SA11AI_822000221217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765182 │\n│ C00401224A     M6    P     20180405910186594324T   IND   STOTT, JIM       CAPE NEDDICKME    03902    NOT EMPLOYED     NOT EMPLOYED           05292017100C00213512SA11AI_825898341217152NULLEARMARKED FOR NANCY PELOSI FOR CONGRESS (C00213512)            4050820181544765184 │\n│ C00401224A     M6    P     20180405910186594424T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   0530201735C00000935SA11AI_826437271217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765185 │\n│ C00401224A     M6    P     20180405910186705024T   IND   STRANGE, WINIFREDANNA MSRIA  FL    34216    NOT EMPLOYED     NOT EMPLOYED           0516201725C00000935SA11AI_813259181217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544768505 │\n│ C00401224A     M6    P     
20180405910186705124T   IND   STRANGE, WINIFREDANNA MSRIA  FL    34216    NOT EMPLOYED     NOT EMPLOYED           0523201725C00000935SA11AI_819911891217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544768506 │\n│  │\n└───────────┴────────┴────────┴────────┴────────────────────┴────────┴────────┴───────────────────┴──────────────┴────────┴───────────┴───────────────────┴─────────────────────────┴──────────┴───────┴───────────┴─────────────────┴─────────┴──────────┴─────────────────────────────────────────────────────────────────┴─────────────────────┘\n
\n```\n:::\n:::\n\n\n::: {#d2a81789 .cell execution_count=3}\n``` {.python .cell-code}\n# For a more comprehesive description of the columns and their meaning, see\n# https://www.fec.gov/campaign-finance-data/contributions-individuals-file-description/\ncolumns = {\n \"CMTE_ID\": \"keep\", # Committee ID\n \"AMNDT_IND\": \"drop\", # Amendment indicator. A = amendment, N = new, T = termination\n \"RPT_TP\": \"drop\", # Report type (monthly, quarterly, etc)\n \"TRANSACTION_PGI\": \"keep\", # Primary/general indicator\n \"IMAGE_NUM\": \"drop\", # Image number\n \"TRANSACTION_TP\": \"drop\", # Transaction type\n \"ENTITY_TP\": \"keep\", # Entity type\n \"NAME\": \"drop\", # Contributor name\n \"CITY\": \"keep\", # Contributor city\n \"STATE\": \"keep\", # Contributor state\n \"ZIP_CODE\": \"drop\", # Contributor zip code\n \"EMPLOYER\": \"drop\", # Contributor employer\n \"OCCUPATION\": \"drop\", # Contributor occupation\n \"TRANSACTION_DT\": \"keep\", # Transaction date\n \"TRANSACTION_AMT\": \"keep\", # Transaction amount\n # Other ID. For individual contributions will be null. For contributions from\n # other FEC committees, will be the committee ID of the other committee.\n \"OTHER_ID\": \"drop\",\n \"TRAN_ID\": \"drop\", # Transaction ID\n \"FILE_NUM\": \"drop\", # File number, unique number assigned to each report filed with the FEC\n \"MEMO_CD\": \"drop\", # Memo code\n \"MEMO_TEXT\": \"drop\", # Memo text\n \"SUB_ID\": \"drop\", # Submission ID. Unique number assigned to each transaction.\n}\n\nrenaming = {old: new for old, new in zip(raw.columns, columns.keys())}\nto_keep = [k for k, v in columns.items() if v == \"keep\"]\nkept = raw.relabel(renaming)[to_keep]\nkept\n```\n\n::: {.cell-output .cell-output-display execution_count=3}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n┃ CMTE_ID    TRANSACTION_PGI  ENTITY_TP  CITY          STATE   TRANSACTION_DT  TRANSACTION_AMT ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n│ stringstringstringstringstringstringint64           │\n├───────────┼─────────────────┼───────────┼──────────────┼────────┼────────────────┼─────────────────┤\n│ C00401224P              IND      OCOEE       FL    05182017      10 │\n│ C00401224P              IND      OCOEE       FL    05192017      15 │\n│ C00401224P              IND      CAPE NEDDICKME    05132017      35 │\n│ C00401224P              IND      CAPE NEDDICKME    05152017      35 │\n│ C00401224P              IND      CAPE NEDDICKME    05192017      5 │\n│ C00401224P              IND      CAPE NEDDICKME    05242017      15 │\n│ C00401224P              IND      CAPE NEDDICKME    05292017      100 │\n│ C00401224P              IND      CAPE NEDDICKME    05302017      35 │\n│ C00401224P              IND      ANNA MSRIA  FL    05162017      25 │\n│ C00401224P              IND      ANNA MSRIA  FL    05232017      25 │\n│  │\n└───────────┴─────────────────┴───────────┴──────────────┴────────┴────────────────┴─────────────────┘\n
\n```\n:::\n:::\n\n\n::: {#1e6d16fe .cell execution_count=4}\n``` {.python .cell-code}\n# 21 million rows\nkept.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n
\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=4}\n\n::: {.ansi-escaped-output}\n```{=html}\n
21730730
\n```\n:::\n\n:::\n:::\n\n\nHuh, what's up with those timings? Previewing the head only took a fraction of a second,\nbut finding the number of rows took 10 seconds.\n\nThat's because duckdb is scanning the .csv file on the fly every time we access it.\nSo we only have to read the first few lines to get that preview,\nbut we have to read the whole file to get the number of rows.\n\nNote that this isn't a feature of Ibis, but a feature of Duckdb. This what I think is\none of the strengths of Ibis: Ibis itself doesn't have to implement any of the\noptimimizations or features of the backends. Those backends can focus on what they do\nbest, and Ibis can get those things for free.\n\nSo, let's tell duckdb to actually read in the file to its native format so later accesses\nwill be faster. This will be a ~20 seconds that we'll only have to pay once.\n\n::: {#185a2d89 .cell execution_count=5}\n``` {.python .cell-code}\nkept = kept.cache()\nkept\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n┃ CMTE_ID    TRANSACTION_PGI  ENTITY_TP  CITY          STATE   TRANSACTION_DT  TRANSACTION_AMT ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n│ stringstringstringstringstringstringint64           │\n├───────────┼─────────────────┼───────────┼──────────────┼────────┼────────────────┼─────────────────┤\n│ C00401224P              IND      OCOEE       FL    05182017      10 │\n│ C00401224P              IND      OCOEE       FL    05192017      15 │\n│ C00401224P              IND      CAPE NEDDICKME    05132017      35 │\n│ C00401224P              IND      CAPE NEDDICKME    05152017      35 │\n│ C00401224P              IND      CAPE NEDDICKME    05192017      5 │\n│ C00401224P              IND      CAPE NEDDICKME    05242017      15 │\n│ C00401224P              IND      CAPE NEDDICKME    05292017      100 │\n│ C00401224P              IND      CAPE NEDDICKME    05302017      35 │\n│ C00401224P              IND      ANNA MSRIA  FL    05162017      25 │\n│ C00401224P              IND      ANNA MSRIA  FL    05232017      25 │\n│  │\n└───────────┴─────────────────┴───────────┴──────────────┴────────┴────────────────┴─────────────────┘\n
\n```\n:::\n:::\n\n\nLook, now accessing it only takes a fraction of a second!\n\n::: {#9253e73f .cell execution_count=6}\n``` {.python .cell-code}\nkept.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n
\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=6}\n\n::: {.ansi-escaped-output}\n```{=html}\n
21730730
\n```\n:::\n\n:::\n:::\n\n\n### Committees Data\n\nThe contributions only list an opaque `CMTE_ID` column. We want to know which actual\ncommittee this is. Let's load the committees table so we can lookup from\ncommittee ID to committee name.\n\n::: {#30076e2c .cell execution_count=7}\n``` {.python .cell-code}\ndef read_committees():\n committees_url = \"https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/2018/committee_summary_2018.csv\"\n # This just creates a view, it doesn't actually fetch the data yet\n tmp = ibis.read_csv(committees_url)\n tmp = tmp[\"CMTE_ID\", \"CMTE_NM\"]\n # The raw table contains multiple rows for each committee id, so lets pick\n # an arbitrary row for each committee id as the representative name.\n deduped = tmp.group_by(\"CMTE_ID\").agg(CMTE_NM=_.CMTE_NM.arbitrary())\n return deduped\n\n\ncomms = read_committees().cache()\ncomms\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ CMTE_ID    CMTE_NM                                                        ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstring                                                         │\n├───────────┼────────────────────────────────────────────────────────────────┤\n│ C00659441JASON ORTITAY FOR CONGRESS                                     │\n│ C00661249SERVICE AFTER SERVICE                                          │\n│ C00457754U.S. TRAVEL ASSOCIATION PAC                                    │\n│ C00577635ISAKSON VICTORY COMMITTEE                                      │\n│ C00297911TEXAS FORESTRY ASSOCIATION FORESTRY POLITICAL ACTION COMMITTEE │\n│ C00551382VOTECLIMATE.US PAC                                             │\n│ C00414318LOEBSACK FOR CONGRESS                                          │\n│ C00610709AUSTIN INNOVATION 2016                                         │\n│ C00131607FLORIDA CITRUS MUTUAL POLITCAL ACTION COMMITTEE                │\n│ C00136531NATIONAL DEMOCRATIC POLICY COMMITTEE                           │\n│                                                               │\n└───────────┴────────────────────────────────────────────────────────────────┘\n
\n```\n:::\n:::\n\n\nNow add the committee name to the contributions table:\n\n::: {#0a9f3b35 .cell execution_count=8}\n``` {.python .cell-code}\ntogether = kept.left_join(comms, \"CMTE_ID\").drop(\"CMTE_ID\", \"CMTE_ID_right\")\ntogether\n```\n\n::: {.cell-output .cell-output-display execution_count=8}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ TRANSACTION_PGI  ENTITY_TP  CITY              STATE   TRANSACTION_DT  TRANSACTION_AMT  CMTE_NM                                         ┃\n┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringstringstringstringint64string                                          │\n├─────────────────┼───────────┼──────────────────┼────────┼────────────────┼─────────────────┼─────────────────────────────────────────────────┤\n│ P              IND      COHASSET        MA    01312017      230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      KEY LARGO       FL    01042017      5000UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      LOOKOUT MOUNTAINGA    01312017      230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      NORTH YARMOUTH  ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      ALPHARETTA      GA    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      FALMOUTH        ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      FALMOUTH        ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      HOLLIS CENTER   ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      FALMOUTH        ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      ALEXANDRIA      VA    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│                                                
│\n└─────────────────┴───────────┴──────────────────┴────────┴────────────────┴─────────────────┴─────────────────────────────────────────────────┘\n
\n```\n:::\n:::\n\n\n## Cleaning\n\nFirst, let's drop any contributions that don't have a committee name. There are only 6 of them.\n\n::: {#14ae871f .cell execution_count=9}\n``` {.python .cell-code}\n# We can do this fearlessly, no .copy() needed, because\n# everything in Ibis is immutable. If we did this in pandas,\n# we might start modifying the original DataFrame accidentally!\ncleaned = together\n\nhas_name = cleaned.CMTE_NM.notnull()\ncleaned = cleaned[has_name]\nhas_name.value_counts()\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ NotNull(CMTE_NM)  NotNull(CMTE_NM)_count ┃\n┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ booleanint64                  │\n├──────────────────┼────────────────────────┤\n│ True             │               21730724 │\n│ False            │                      6 │\n└──────────────────┴────────────────────────┘\n
\n```\n:::\n:::\n\n\nLet's look at the `ENTITY_TP` column. This represents the type of entity that\nmade the contribution:\n\n::: {#72577ed8 .cell execution_count=10}\n``` {.python .cell-code}\ntogether.ENTITY_TP.value_counts()\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n┃ ENTITY_TP  ENTITY_TP_count ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n│ stringint64           │\n├───────────┼─────────────────┤\n│ IND      21687992 │\n│ CCM      698 │\n│ CAN      13659 │\n│ ORG      18555 │\n│ PTY      49 │\n│ COM      867 │\n│ PAC      3621 │\n│ NULL5289 │\n└───────────┴─────────────────┘\n
\n```\n:::\n:::\n\n\nWe only care about contributions from individuals.\n\nOnce we filter on this column, the contents of it are irrelevant, so let's drop it.\n\n::: {#f29924a2 .cell execution_count=11}\n``` {.python .cell-code}\ncleaned = together[_.ENTITY_TP == \"IND\"].drop(\"ENTITY_TP\")\n```\n:::\n\n\nIt looks like the `TRANSACTION_DT` column was a raw string like \"MMDDYYYY\",\nso let's convert that to a proper date type.\n\n::: {#15443483 .cell execution_count=12}\n``` {.python .cell-code}\nfrom ibis.expr.types import StringValue, DateValue\n\n\ndef mmddyyyy_to_date(val: StringValue) -> DateValue:\n return val.cast(str).lpad(8, \"0\").to_timestamp(\"%m%d%Y\").date()\n\n\ncleaned = cleaned.mutate(date=mmddyyyy_to_date(_.TRANSACTION_DT)).drop(\"TRANSACTION_DT\")\ncleaned\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓\n┃ TRANSACTION_PGI  CITY              STATE   TRANSACTION_AMT  CMTE_NM                                          date       ┃\n┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩\n│ stringstringstringint64stringdate       │\n├─────────────────┼──────────────────┼────────┼─────────────────┼─────────────────────────────────────────────────┼────────────┤\n│ P              COHASSET        MA    230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              KEY LARGO       FL    5000UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-04 │\n│ P              LOOKOUT MOUNTAINGA    230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              NORTH YARMOUTH  ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              ALPHARETTA      GA    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              HOLLIS CENTER   ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              ALEXANDRIA      VA    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│           │\n└─────────────────┴──────────────────┴────────┴─────────────────┴─────────────────────────────────────────────────┴────────────┘\n
\n```\n:::\n:::\n\n\nThe `TRANSACTION_PGI` column represents the type (primary, general, etc) of election,\nand the year. But it seems to be not very consistent:\n\n::: {#fa016097 .cell execution_count=13}\n``` {.python .cell-code}\ncleaned.TRANSACTION_PGI.topk(10)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ TRANSACTION_PGI  Count(TRANSACTION_PGI) ┃\n┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringint64                  │\n├─────────────────┼────────────────────────┤\n│ P              17013596 │\n│ G2018          2095123 │\n│ P2018          1677183 │\n│ P2020          208501 │\n│ O2018          161874 │\n│ S2017          124336 │\n│ G2017          98401 │\n│ P2022          91136 │\n│ P2017          61153 │\n│ R2017          54281 │\n└─────────────────┴────────────────────────┘\n
\n```\n:::\n:::\n\n\n::: {#35c8a393 .cell execution_count=14}\n``` {.python .cell-code}\ndef get_election_type(pgi: StringValue) -> StringValue:\n \"\"\"Use the first letter of the TRANSACTION_PGI column to determine the election type\n\n If the first letter is not one of the known election stage, then return null.\n \"\"\"\n election_types = {\n \"P\": \"primary\",\n \"G\": \"general\",\n \"O\": \"other\",\n \"C\": \"convention\",\n \"R\": \"runoff\",\n \"S\": \"special\",\n \"E\": \"recount\",\n }\n first_letter = pgi[0]\n return first_letter.substitute(election_types, else_=ibis.NA)\n\n\ncleaned = cleaned.mutate(election_type=get_election_type(_.TRANSACTION_PGI)).drop(\n \"TRANSACTION_PGI\"\n)\ncleaned\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ CITY              STATE   TRANSACTION_AMT  CMTE_NM                                          date        election_type ┃\n┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringstringint64stringdatestring        │\n├──────────────────┼────────┼─────────────────┼─────────────────────────────────────────────────┼────────────┼───────────────┤\n│ COHASSET        MA    230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary       │\n│ KEY LARGO       FL    5000UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-04primary       │\n│ LOOKOUT MOUNTAINGA    230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary       │\n│ NORTH YARMOUTH  ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary       │\n│ ALPHARETTA      GA    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary       │\n│ FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary       │\n│ FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary       │\n│ HOLLIS CENTER   ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary       │\n│ FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary       │\n│ ALEXANDRIA      VA    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary       │\n│              │\n└──────────────────┴────────┴─────────────────┴─────────────────────────────────────────────────┴────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\nThat worked well! There are 0 nulls in the resulting column, so we always were\nable to determine the election type.\n\n::: {#e7038c36 .cell execution_count=15}\n``` {.python .cell-code}\ncleaned.election_type.topk(10)\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓\n┃ election_type  Count(election_type) ┃\n┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringint64                │\n├───────────────┼──────────────────────┤\n│ primary      19061953 │\n│ general      2216685 │\n│ other        161965 │\n│ special      149572 │\n│ runoff       69637 │\n│ convention   22453 │\n│ recount      5063 │\n│ NULL0 │\n└───────────────┴──────────────────────┘\n
\n```\n:::\n:::\n\n\nAbout 1/20 of transactions are negative. These could represent refunds, or they\ncould be data entry errors. Let's drop them to keep it simple.\n\n::: {#ab64b9b2 .cell execution_count=16}\n``` {.python .cell-code}\nabove_zero = cleaned.TRANSACTION_AMT > 0\ncleaned = cleaned[above_zero]\nabove_zero.value_counts()\n```\n\n::: {.cell-output .cell-output-display execution_count=16}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ Greater(TRANSACTION_AMT, 0)  Greater(TRANSACTION_AMT, 0)_count ┃\n┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ booleanint64                             │\n├─────────────────────────────┼───────────────────────────────────┤\n│ True                        │                          20669809 │\n│ False                       │                           1018183 │\n└─────────────────────────────┴───────────────────────────────────┘\n
\n```\n:::\n:::\n\n\n## Adding Features\n\nNow that the data is cleaned up to a usable format, let's add some features.\n\nFirst, it's useful to categorize donations by size, placing them into buckets\nof small, medium, large, etc.\n\n::: {#db1e9cbe .cell execution_count=17}\n``` {.python .cell-code}\nedges = [\n 10,\n 50,\n 100,\n 500,\n 1000,\n 5000,\n]\nlabels = [\n \"<10\",\n \"10-50\",\n \"50-100\",\n \"100-500\",\n \"500-1000\",\n \"1000-5000\",\n \"5000+\",\n]\n\n\ndef bucketize(vals, edges, str_labels):\n # Uses Ibis's .bucket() method to create a categorical column\n int_labels = vals.bucket(edges, include_under=True, include_over=True)\n # Map the integer labels to the string labels\n int_to_str = {str(i): s for i, s in enumerate(str_labels)}\n return int_labels.cast(str).substitute(int_to_str)\n\n\nfeatured = cleaned.mutate(amount_bucket=bucketize(_.TRANSACTION_AMT, edges, labels))\nfeatured\n```\n\n::: {.cell-output .cell-output-display execution_count=17}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ CITY              STATE   TRANSACTION_AMT  CMTE_NM                                          date        election_type  amount_bucket ┃\n┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringstringint64stringdatestringstring        │\n├──────────────────┼────────┼─────────────────┼─────────────────────────────────────────────────┼────────────┼───────────────┼───────────────┤\n│ COHASSET        MA    230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary      100-500       │\n│ KEY LARGO       FL    5000UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-04primary      1000-5000     │\n│ LOOKOUT MOUNTAINGA    230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary      100-500       │\n│ NORTH YARMOUTH  ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary      100-500       │\n│ ALPHARETTA      GA    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary      100-500       │\n│ FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary      100-500       │\n│ FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary      100-500       │\n│ HOLLIS CENTER   ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary      100-500       │\n│ FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary      100-500       │\n│ ALEXANDRIA      VA    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31primary      100-500       │\n│              │\n└──────────────────┴────────┴─────────────────┴─────────────────────────────────────────────────┴────────────┴───────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\n## Analysis\n\n### By donation size\n\nOne thing we can look at is the donation breakdown by size:\n- Are most donations small or large?\n- Where do politicians/committees get most of their money from? Large or small donations?\n\nWe also will compare performance of Ibis vs pandas during this groupby.\n\n::: {#2c306d0f .cell execution_count=18}\n``` {.python .cell-code}\ndef summary_by(table, by):\n return table.group_by(by).agg(\n n_donations=_.count(),\n total_amount=_.TRANSACTION_AMT.sum(),\n mean_amount=_.TRANSACTION_AMT.mean(),\n median_amount=_.TRANSACTION_AMT.approx_median(),\n )\n\n\ndef summary_by_pandas(df, by):\n return df.groupby(by, as_index=False).agg(\n n_donations=(\"election_type\", \"count\"),\n total_amount=(\"TRANSACTION_AMT\", \"sum\"),\n mean_amount=(\"TRANSACTION_AMT\", \"mean\"),\n median_amount=(\"TRANSACTION_AMT\", \"median\"),\n )\n\n\n# persist the input data so the following timings of the group_by are accurate.\nsubset = featured[\"election_type\", \"amount_bucket\", \"TRANSACTION_AMT\"]\nsubset = subset.cache()\npandas_subset = subset.execute()\n```\n:::\n\n\nLet's take a look at what we are actually computing:\n\n::: {#a621ca5f .cell execution_count=19}\n``` {.python .cell-code}\nby_type_and_bucket = summary_by(subset, [\"election_type\", \"amount_bucket\"])\nby_type_and_bucket\n```\n\n::: {.cell-output .cell-output-display execution_count=19}\n```{=html}\n
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ election_type  amount_bucket  n_donations  total_amount  mean_amount   median_amount ┃\n┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringstringint64int64float64int64         │\n├───────────────┼───────────────┼─────────────┼──────────────┼──────────────┼───────────────┤\n│ primary      50-100       266393315542654058.34476350 │\n│ primary      10-50        811540318766625123.12469925 │\n│ primary      100-500      3636287637353634175.275943150 │\n│ primary      <10          2423728100807214.1591805 │\n│ primary      500-1000     634677334630687527.245649500 │\n│ primary      1000-5000    68475512313948741798.2999381008 │\n│ primary      5000+        44085155837111635349.23706510000 │\n│ general      100-500      700821123174568175.757530150 │\n│ general      50-100       3043631618431253.17437450 │\n│ general      10-50        6607871441158821.80973325 │\n│  │\n└───────────────┴───────────────┴─────────────┴──────────────┴──────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\nOK, now let's do our timings.\n\nOne interesting thing to pay attention to here is the execution time for the following\ngroupby. Before, we could get away with lazy execution: because we only wanted to preview\nthe first few rows, we only had to compute the first few rows, so all our previews were\nvery fast.\n\nBut now, as soon as we do a groupby, we have to actually go through the whole dataset\nin order to compute the aggregate per group. So this is going to be slower. BUT,\nduckdb is still quite fast. It only takes milliseconds to groupby-agg all 20 million rows!\n\n::: {#fc3694c3 .cell execution_count=20}\n``` {.python .cell-code}\n%timeit summary_by(subset, [\"election_type\", \"amount_bucket\"]).execute() # .execute() so we actually fetch the data\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n679 ms ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n```\n:::\n:::\n\n\nNow let's try the same thing in pandas:\n\n::: {#ab990661 .cell execution_count=21}\n``` {.python .cell-code}\n%timeit summary_by_pandas(pandas_subset, [\"election_type\", \"amount_bucket\"])\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n3.59 s ± 31.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n```\n:::\n:::\n\n\nIt takes about 4 seconds, which is about 10 times slower than duckdb.\nAt this scale, it again doesn't matter,\nbut you could imagine with a dataset much larger than this, it would matter.\n\nLet's also think about memory usage:\n\n::: {#03834f0b .cell execution_count=22}\n``` {.python .cell-code}\npandas_subset.memory_usage(deep=True).sum() / 1e9 # GB\n```\n\n::: {.cell-output .cell-output-display execution_count=22}\n```\n2.782586663\n```\n:::\n:::\n\n\nThe source dataframe is couple gigabytes, so probably during the groupby,\nthe peak memory usage is going to be a bit higher than this. 
You could use a profiler\nsuch as [FIL](https://github.com/pythonspeed/filprofiler) if you wanted an exact number,\nI was too lazy to use that here.\n\nAgain, this works on my laptop at this dataset size, but much larger than this and I'd\nstart having problems. Duckdb on the other hand is designed around working out of core\nso it should scale to datasets into the hundreds of gigabytes, much larger than your\ncomputer's RAM.\n\n### Back to analysis\n\nOK, let's plot the result of that groupby.\n\nSurprise! (Or maybe not...) Most donations are small. But most of the money comes\nfrom donations larger than $1000.\n\nWell if that's the case, why do politicians spend so much time soliciting small\ndonations? One explanation is that they can use the number of donations\nas a marketing pitch, to show how popular they are, and thus how viable of a\ncandidate they are.\n\nThis also might explain whose interests are being served by our politicians.\n\n::: {#cf2c035e .cell execution_count=23}\n``` {.python .cell-code}\nimport altair as alt\n\n# Do some bookkeeping so the buckets are displayed smallest to largest on the charts\nbucket_col = alt.Column(\"amount_bucket:N\", sort=labels)\n\nn_by_bucket = (\n alt.Chart(by_type_and_bucket.execute())\n .mark_bar()\n .encode(\n x=bucket_col,\n y=\"n_donations:Q\",\n color=\"election_type:N\",\n )\n)\ntotal_by_bucket = (\n alt.Chart(by_type_and_bucket.execute())\n .mark_bar()\n .encode(\n x=bucket_col,\n y=\"total_amount:Q\",\n color=\"election_type:N\",\n )\n)\nn_by_bucket | total_by_bucket\n```\n\n::: {.cell-output .cell-output-display execution_count=23}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n### By election stage\n\nLet's look at how donations break down by election stage. Do people donate\ndifferently for primary elections vs general elections?\n\nLet's ignore everything but primary and general elections, since they are the\nmost common, and arguably the most important.\n\n::: {#92651642 .cell execution_count=24}\n``` {.python .cell-code}\ngb2 = by_type_and_bucket[_.election_type.isin((\"primary\", \"general\"))]\nn_donations_per_election_type = _.n_donations.sum().over(group_by=\"election_type\")\nfrac = _.n_donations / n_donations_per_election_type\ngb2 = gb2.mutate(frac_n_donations_per_election_type=frac)\ngb2\n```\n\n::: {.cell-output .cell-output-display execution_count=24}\n```{=html}\n
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ election_type  amount_bucket  n_donations  total_amount  mean_amount   median_amount  frac_n_donations_per_election_type ┃\n┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringint64int64float64int64float64                            │\n├───────────────┼───────────────┼─────────────┼──────────────┼──────────────┼───────────────┼────────────────────────────────────┤\n│ primary      10-50        811540318766625123.124699250.445831 │\n│ primary      <10          2423728100807214.15918050.133151 │\n│ primary      100-500      3636287637353634175.2759431500.199765 │\n│ primary      50-100       266393315542654058.344763500.146347 │\n│ primary      500-1000     634677334630687527.2456495000.034867 │\n│ primary      1000-5000    68475512313948741798.29993810080.037618 │\n│ primary      5000+        44085155837111635349.237065100000.002422 │\n│ general      50-100       3043631618431253.174374500.138017 │\n│ general      100-500      700821123174568175.7575301500.317796 │\n│ general      500-1000     17418291015697522.5321625000.078985 │\n│  │\n└───────────────┴───────────────┴─────────────┴──────────────┴──────────────┴───────────────┴────────────────────────────────────┘\n
\n```\n:::\n:::\n\n\nIt looks like primary elections get a larger proportion of small donations.\n\n::: {#fd42d9bf .cell execution_count=25}\n``` {.python .cell-code}\nalt.Chart(gb2.execute()).mark_bar().encode(\n x=\"election_type:O\",\n y=\"frac_n_donations_per_election_type:Q\",\n color=bucket_col,\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=25}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n### By recipient\n\nLet's look at the top players. Who gets the most donations?\n\nFar and away it is ActBlue, which acts as a conduit for donations to Democratic\ninterests.\n\nBeto O'Rourke is the top individual politician, hats off to him!\n\n::: {#e844f42e .cell execution_count=26}\n``` {.python .cell-code}\nby_recip = summary_by(featured, \"CMTE_NM\")\nby_recip\n```\n\n::: {.cell-output .cell-output-display execution_count=26}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ CMTE_NM                                                                           n_donations  total_amount  mean_amount  median_amount ┃\n┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringint64int64float64int64         │\n├──────────────────────────────────────────────────────────────────────────────────┼─────────────┼──────────────┼─────────────┼───────────────┤\n│ EXELON CORPORATION POLITICAL ACTION COMMITTEE (EXELON PAC)                      132501939503146.377585118 │\n│ ARCHER DANIELS MIDLAND COMPANY-ADM PAC                                          446027580761.84013525 │\n│ PFIZER INC. PAC                                                                 46900194868941.54987220 │\n│ SUEZ WATER INC. FEDERAL PAC                                                     10816873156.231481120 │\n│ INTERNATIONAL WAREHOUSE LOGISTICS ASSOCIATION PAC                               901322001468.8888891000 │\n│ BAKERY, CONFECTIONERY, TOBACCO WORKERS AND GRAIN MILLERS INTERNATIONAL UNION PAC3871909149.33074930 │\n│ UNION PACIFIC CORP. FUND FOR EFFECTIVE GOVERNMENT                               161182436963151.195123114 │\n│ NATIONAL ASSOCIATION OF REALTORS POLITICAL ACTION COMMITTEE                     242775492063226.224945154 │\n│ AMERICAN FINANCIAL SERVICES ASSOCIATION PAC                                     690685839993.96956565 │\n│ WEYERHAEUSER COMPANY POLITICAL ACTION COMMITTEE                                 551234324462.27213430 │\n│  │\n└──────────────────────────────────────────────────────────────────────────────────┴─────────────┴──────────────┴─────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\n::: {#a0c1efd8 .cell execution_count=27}\n``` {.python .cell-code}\ntop_recip = by_recip.order_by(ibis.desc(\"n_donations\")).head(10)\nalt.Chart(top_recip.execute()).mark_bar().encode(\n x=alt.X(\"CMTE_NM:O\", sort=\"-y\"),\n y=\"n_donations:Q\",\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=27}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n### By Location\n\nWhere are the largest donations coming from?\n\n::: {#3348eca1 .cell execution_count=28}\n``` {.python .cell-code}\nf2 = featured.mutate(loc=_.CITY + \", \" + _.STATE).drop(\"CITY\", \"STATE\")\nby_loc = summary_by(f2, \"loc\")\n# Drop the places with a small number of donations so we're\n# resistant to outliers for the mean\nby_loc = by_loc[_.n_donations > 1000]\nby_loc\n```\n\n::: {.cell-output .cell-output-display execution_count=28}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ loc               n_donations  total_amount  mean_amount  median_amount ┃\n┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringint64int64float64int64         │\n├──────────────────┼─────────────┼──────────────┼─────────────┼───────────────┤\n│ DALLAS, TX      15403866558403432.09080258 │\n│ PHILADELPHIA, PA22293836054977161.72647662 │\n│ MALIBU, CA      116994934763421.81066850 │\n│ SANTEE, CA      245420127482.01874526 │\n│ WINNETKA, IL    85895621809654.535918172 │\n│ OREM, UT        2110837475396.90758350 │\n│ MESA, AZ        22128185663683.90437520 │\n│ WAYZATA, MN     64883326275512.681104117 │\n│ MINNETONKA, MN  57091187881208.07164150 │\n│ OJAI, CA        4496926422206.05471525 │\n│  │\n└──────────────────┴─────────────┴──────────────┴─────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\n::: {#95c93760 .cell execution_count=29}\n``` {.python .cell-code}\ndef top_by(col):\n top = by_loc.order_by(ibis.desc(col)).head(10)\n return (\n alt.Chart(top.execute())\n .mark_bar()\n .encode(\n x=alt.X('loc:O', sort=\"-y\"),\n y=col,\n )\n )\n\n\ntop_by(\"n_donations\") | top_by(\"total_amount\") | top_by(\"mean_amount\") | top_by(\n \"median_amount\"\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=29}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n### By month\n\nWhen do the donations come in?\n\n::: {#6d0776d2 .cell execution_count=30}\n``` {.python .cell-code}\nby_month = summary_by(featured, _.date.month().name(\"month_int\"))\n# Sorta hacky, .substritute doesn't work to change dtypes (yet?)\n# so we cast to string and then do our mapping\nmonth_map = {\n \"1\": \"Jan\",\n \"2\": \"Feb\",\n \"3\": \"Mar\",\n \"4\": \"Apr\",\n \"5\": \"May\",\n \"6\": \"Jun\",\n \"7\": \"Jul\",\n \"8\": \"Aug\",\n \"9\": \"Sep\",\n \"10\": \"Oct\",\n \"11\": \"Nov\",\n \"12\": \"Dec\",\n}\nby_month = by_month.mutate(month_str=_.month_int.cast(str).substitute(month_map))\nby_month\n```\n\n::: {.cell-output .cell-output-display execution_count=30}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓\n┃ month_int  n_donations  total_amount  mean_amount  median_amount  month_str ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩\n│ int32int64int64float64int64string    │\n├───────────┼─────────────┼──────────────┼─────────────┼───────────────┼───────────┤\n│      NULL1514250297165.32166499NULL      │\n│         1348979174837854500.998209122Jan       │\n│         2581646255997655440.126219100Feb       │\n│         31042577430906797413.30932681Mar       │\n│         41088244299252692274.98676050Apr       │\n│         51374247387317192281.83957648May       │\n│         61667285465305247279.07961044Jun       │\n│         71607053320528605199.45117235Jul       │\n│         82023466473544182234.02626135Aug       │\n│         92583847697888624270.09672938Sep       │\n│                  │\n└───────────┴─────────────┴──────────────┴─────────────┴───────────────┴───────────┘\n
\n```\n:::\n:::\n\n\n::: {#a2b27c61 .cell execution_count=31}\n``` {.python .cell-code}\nmonths_in_order = list(month_map.values())\nalt.Chart(by_month.execute()).mark_bar().encode(\n x=alt.X(\"month_str:O\", sort=months_in_order),\n y=\"n_donations:Q\",\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=31}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n## Conclusion\n\nThanks for following along! I hope you've learned something about Ibis, and\nmaybe even about campaign finance.\n\nIbis is a great tool for exploring data. I now find myself reaching for it\nwhen in the past I would have reached for pandas.\n\nSome of the highlights for me:\n\n- Fast, lazy execution, a great display format, and good type hinting/editor support for a great REPL experience.\n- Very well thought-out API and semantics (e.g. `isinstance(val, NumericValue)`?? That's beautiful!)\n- Fast and fairly complete string support, since I work with a lot of text data.\n- Extremely responsive maintainers. Sometimes I've submitted multiple feature requests and bug reports in a single day, and a PR has been merged by the next day.\n- Escape hatch to SQL. I didn't have to use that here, but if something isn't supported, you can always fall back to SQL.\n\nCheck out [The Ibis Website](https://ibis-project.org/) for more information.\n\n", + "engine": "jupyter", + "markdown": "---\ntitle: \"Exploring campaign finance data\"\nauthor: \"Nick Crews\"\ndate: \"2023-03-24\"\ncategories:\n - blog\n - data engineering\n - case study\n - duckdb\n - performance\n---\n\nHi! My name is [Nick Crews](https://www.linkedin.com/in/nicholas-b-crews/),\nand I'm a data engineer that looks at public campaign finance data.\n\nIn this post, I'll walk through how I use Ibis to explore public campaign contribution\ndata from the Federal Election Commission (FEC). We'll do some loading,\ncleaning, featurizing, and visualization. 
There will be filtering, sorting, grouping,\nand aggregation.\n\n## Downloading The Data\n\n::: {#e29f35c8 .cell execution_count=2}\n``` {.python .cell-code}\nfrom pathlib import Path\nfrom zipfile import ZipFile\nfrom urllib.request import urlretrieve\n\n# Download and unzip the 2018 individual contributions data\nurl = \"https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/2018/indiv18.zip\"\nzip_path = Path(\"indiv18.zip\")\ncsv_path = Path(\"indiv18.csv\")\n\nif not zip_path.exists():\n urlretrieve(url, zip_path)\n\nif not csv_path.exists():\n with ZipFile(zip_path) as zip_file, csv_path.open(\"w\") as csv_file:\n for line in zip_file.open(\"itcont.txt\"):\n csv_file.write(line.decode())\n```\n:::\n\n\n## Loading the data\n\nNow that we have our raw data in a .csv format, let's load it into Ibis,\nusing the duckdb backend.\n\nNote that a 4.3 GB .csv would be near the limit of what pandas could\nhandle on my laptop with 16GB of RAM. In pandas, typically every time\nyou perform a transformation on the data, a copy of the data is made.\nI could only do a few transformations before I ran out of memory.\n\nWith Ibis, this problem is solved in two different ways.\n\nFirst, because they are designed to work with very large datasets,\nmany (all?) SQL backends support out of core operations.\nThe data lives on disk, and are only loaded in a streaming fashion\nwhen needed, and then written back to disk as the operation is performed.\n\nSecond, unless you explicitly ask for it, Ibis makes use of lazy\nevaluation. This means that when you ask for a result, the\nresult is not persisted in memory. Only the original source\ndata is persisted. 
Everything else is derived from this on the fly.\n\n::: {#0a6991f4 .cell execution_count=3}\n``` {.python .cell-code}\nimport ibis\nfrom ibis import _\n\nibis.options.interactive = True\n\n# The raw .csv file doesn't have column names, so we will add them in the next step.\nraw = ibis.read_csv(csv_path)\nraw\n```\n\n::: {.cell-output .cell-output-display execution_count=16}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ C00401224  A       M6      P       201804059101866001  24T     IND     STOUFFER, LEIGH    AMSTELVEEN    ZZ      1187RC     MYSELF             SELF EMPLOYED            05172017  10     C00458000  SA11AI_81445687  1217152  column18  EARMARKED FOR PROGRESSIVE CHANGE CAMPAIGN COMMITTEE (C00458000)  4050820181544765358 ┃\n┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringstringstringint64stringstringstringstringstringstringstringstringstringint64stringstringint64stringstringint64               │\n├───────────┼────────┼────────┼────────┼────────────────────┼────────┼────────┼───────────────────┼──────────────┼────────┼───────────┼───────────────────┼─────────────────────────┼──────────┼───────┼───────────┼─────────────────┼─────────┼──────────┼─────────────────────────────────────────────────────────────────┼─────────────────────┤\n│ C00401224A     M6    P     20180405910186774824T   IND   STRAWS, JOYCE    OCOEE       FL    34761    SILVERSEA CRUISESRESERVATIONS SUPERVISOR0518201710C00000935SA11AI_815923361217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544770597 │\n│ C00401224A     M6    P     20180405910186774824T   IND   STRAWS, JOYCE    OCOEE       FL    34761    SILVERSEA CRUISESRESERVATIONS SUPERVISOR0519201715C00000935SA11AI_816275621217152NULLEARMARKED FOR DCCC (C00000935)                      
           4050820181544770598 │\n│ C00401224A     M6    P     20180405910186594224T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   0513201735C00000935SA11AI_810479211217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765179 │\n│ C00401224A     M6    P     20180405910186594224T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   0515201735C00000935SA11AI_812092091217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765180 │\n│ C00401224A     M6    P     20180405910186594224T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   051920175C00000935SA11AI_816052231217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765181 │\n│ C00401224A     M6    P     20180405910186594324T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   0524201715C00000935SA11AI_822000221217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765182 │\n│ C00401224A     M6    P     20180405910186594324T   IND   STOTT, JIM       CAPE NEDDICKME    03902    NOT EMPLOYED     NOT EMPLOYED           05292017100C00213512SA11AI_825898341217152NULLEARMARKED FOR NANCY PELOSI FOR CONGRESS (C00213512)            4050820181544765184 │\n│ C00401224A     M6    P     20180405910186594424T   IND   STOTT, JIM       CAPE NEDDICKME    039020760NONE             NONE                   0530201735C00000935SA11AI_826437271217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544765185 │\n│ C00401224A     M6    P     20180405910186705024T   IND   STRANGE, WINIFREDANNA MSRIA  FL    34216    NOT EMPLOYED     NOT EMPLOYED           0516201725C00000935SA11AI_813259181217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544768505 │\n│ C00401224A     M6    P     
20180405910186705124T   IND   STRANGE, WINIFREDANNA MSRIA  FL    34216    NOT EMPLOYED     NOT EMPLOYED           0523201725C00000935SA11AI_819911891217152NULLEARMARKED FOR DCCC (C00000935)                                 4050820181544768506 │\n│  │\n└───────────┴────────┴────────┴────────┴────────────────────┴────────┴────────┴───────────────────┴──────────────┴────────┴───────────┴───────────────────┴─────────────────────────┴──────────┴───────┴───────────┴─────────────────┴─────────┴──────────┴─────────────────────────────────────────────────────────────────┴─────────────────────┘\n
\n```\n:::\n:::\n\n\n::: {#ebb6e702 .cell execution_count=4}\n``` {.python .cell-code}\n# For a more comprehesive description of the columns and their meaning, see\n# https://www.fec.gov/campaign-finance-data/contributions-individuals-file-description/\ncolumns = {\n \"CMTE_ID\": \"keep\", # Committee ID\n \"AMNDT_IND\": \"drop\", # Amendment indicator. A = amendment, N = new, T = termination\n \"RPT_TP\": \"drop\", # Report type (monthly, quarterly, etc)\n \"TRANSACTION_PGI\": \"keep\", # Primary/general indicator\n \"IMAGE_NUM\": \"drop\", # Image number\n \"TRANSACTION_TP\": \"drop\", # Transaction type\n \"ENTITY_TP\": \"keep\", # Entity type\n \"NAME\": \"drop\", # Contributor name\n \"CITY\": \"keep\", # Contributor city\n \"STATE\": \"keep\", # Contributor state\n \"ZIP_CODE\": \"drop\", # Contributor zip code\n \"EMPLOYER\": \"drop\", # Contributor employer\n \"OCCUPATION\": \"drop\", # Contributor occupation\n \"TRANSACTION_DT\": \"keep\", # Transaction date\n \"TRANSACTION_AMT\": \"keep\", # Transaction amount\n # Other ID. For individual contributions will be null. For contributions from\n # other FEC committees, will be the committee ID of the other committee.\n \"OTHER_ID\": \"drop\",\n \"TRAN_ID\": \"drop\", # Transaction ID\n \"FILE_NUM\": \"drop\", # File number, unique number assigned to each report filed with the FEC\n \"MEMO_CD\": \"drop\", # Memo code\n \"MEMO_TEXT\": \"drop\", # Memo text\n \"SUB_ID\": \"drop\", # Submission ID. Unique number assigned to each transaction.\n}\n\nrenaming = {old: new for old, new in zip(raw.columns, columns.keys())}\nto_keep = [k for k, v in columns.items() if v == \"keep\"]\nkept = raw.relabel(renaming)[to_keep]\nkept\n```\n\n::: {.cell-output .cell-output-display execution_count=17}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n┃ CMTE_ID    TRANSACTION_PGI  ENTITY_TP  CITY          STATE   TRANSACTION_DT  TRANSACTION_AMT ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n│ stringstringstringstringstringstringint64           │\n├───────────┼─────────────────┼───────────┼──────────────┼────────┼────────────────┼─────────────────┤\n│ C00401224P              IND      OCOEE       FL    05182017      10 │\n│ C00401224P              IND      OCOEE       FL    05192017      15 │\n│ C00401224P              IND      CAPE NEDDICKME    05132017      35 │\n│ C00401224P              IND      CAPE NEDDICKME    05152017      35 │\n│ C00401224P              IND      CAPE NEDDICKME    05192017      5 │\n│ C00401224P              IND      CAPE NEDDICKME    05242017      15 │\n│ C00401224P              IND      CAPE NEDDICKME    05292017      100 │\n│ C00401224P              IND      CAPE NEDDICKME    05302017      35 │\n│ C00401224P              IND      ANNA MSRIA  FL    05162017      25 │\n│ C00401224P              IND      ANNA MSRIA  FL    05232017      25 │\n│  │\n└───────────┴─────────────────┴───────────┴──────────────┴────────┴────────────────┴─────────────────┘\n
\n```\n:::\n:::\n\n\n::: {#3f4ad522 .cell execution_count=5}\n``` {.python .cell-code}\n# 21 million rows\nkept.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n
\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=18}\n\n::: {.ansi-escaped-output}\n```{=html}\n
┌──────────┐\n│ 21730730 │\n└──────────┘
\n```\n:::\n\n:::\n:::\n\n\nHuh, what's up with those timings? Previewing the head only took a fraction of a second,\nbut finding the number of rows took 10 seconds.\n\nThat's because duckdb is scanning the .csv file on the fly every time we access it.\nSo we only have to read the first few lines to get that preview,\nbut we have to read the whole file to get the number of rows.\n\nNote that this isn't a feature of Ibis, but a feature of Duckdb. This what I think is\none of the strengths of Ibis: Ibis itself doesn't have to implement any of the\noptimimizations or features of the backends. Those backends can focus on what they do\nbest, and Ibis can get those things for free.\n\nSo, let's tell duckdb to actually read in the file to its native format so later accesses\nwill be faster. This will be a ~20 seconds that we'll only have to pay once.\n\n::: {#c45e7319 .cell execution_count=6}\n``` {.python .cell-code}\nkept = kept.cache()\nkept\n```\n\n::: {.cell-output .cell-output-display execution_count=19}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n┃ CMTE_ID    TRANSACTION_PGI  ENTITY_TP  CITY          STATE   TRANSACTION_DT  TRANSACTION_AMT ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n│ stringstringstringstringstringstringint64           │\n├───────────┼─────────────────┼───────────┼──────────────┼────────┼────────────────┼─────────────────┤\n│ C00401224P              IND      OCOEE       FL    05182017      10 │\n│ C00401224P              IND      OCOEE       FL    05192017      15 │\n│ C00401224P              IND      CAPE NEDDICKME    05132017      35 │\n│ C00401224P              IND      CAPE NEDDICKME    05152017      35 │\n│ C00401224P              IND      CAPE NEDDICKME    05192017      5 │\n│ C00401224P              IND      CAPE NEDDICKME    05242017      15 │\n│ C00401224P              IND      CAPE NEDDICKME    05292017      100 │\n│ C00401224P              IND      CAPE NEDDICKME    05302017      35 │\n│ C00401224P              IND      ANNA MSRIA  FL    05162017      25 │\n│ C00401224P              IND      ANNA MSRIA  FL    05232017      25 │\n│  │\n└───────────┴─────────────────┴───────────┴──────────────┴────────┴────────────────┴─────────────────┘\n
\n```\n:::\n:::\n\n\nLook, now accessing it only takes a fraction of a second!\n\n::: {#881326dd .cell execution_count=7}\n``` {.python .cell-code}\nkept.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n
\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=20}\n\n::: {.ansi-escaped-output}\n```{=html}\n
┌──────────┐\n│ 21730730 │\n└──────────┘
\n```\n:::\n\n:::\n:::\n\n\n### Committees Data\n\nThe contributions only list an opaque `CMTE_ID` column. We want to know which actual\ncommittee this is. Let's load the committees table so we can lookup from\ncommittee ID to committee name.\n\n::: {#ae8760f6 .cell execution_count=8}\n``` {.python .cell-code}\ndef read_committees():\n committees_url = \"https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/2018/committee_summary_2018.csv\"\n # This just creates a view, it doesn't actually fetch the data yet\n tmp = ibis.read_csv(committees_url)\n tmp = tmp[\"CMTE_ID\", \"CMTE_NM\"]\n # The raw table contains multiple rows for each committee id, so lets pick\n # an arbitrary row for each committee id as the representative name.\n deduped = tmp.group_by(\"CMTE_ID\").agg(CMTE_NM=_.CMTE_NM.arbitrary())\n return deduped\n\n\ncomms = read_committees().cache()\ncomms\n```\n\n::: {.cell-output .cell-output-display execution_count=21}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ CMTE_ID    CMTE_NM                                                        ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstring                                                         │\n├───────────┼────────────────────────────────────────────────────────────────┤\n│ C00659441JASON ORTITAY FOR CONGRESS                                     │\n│ C00297911TEXAS FORESTRY ASSOCIATION FORESTRY POLITICAL ACTION COMMITTEE │\n│ C00340745WADDELL & REED FINANCIAL, INC. POLITICAL ACTION COMMITTEE      │\n│ C00679217CANTWELL-WARREN VICTORY FUND                                   │\n│ C00101204NATIONAL FISHERIES INSTITUTE (FISHPAC)                         │\n│ C00010520MEREDITH CORPORATION EMPLOYEES FUND FOR BETTER GOVERNMENT      │\n│ C00532788LAFAYETTE COUNTY DEMOCRATIC PARTY                              │\n│ C00128561TOLL BROS. INC. PAC                                            │\n│ C00510958WENDYROGERS.ORG                                                │\n│ C00665604COMMITTEE TO ELECT BILL EBBEN                                  │\n│                                                               │\n└───────────┴────────────────────────────────────────────────────────────────┘\n
\n```\n:::\n:::\n\n\nNow add the committee name to the contributions table:\n\n::: {#8fe204d4 .cell execution_count=9}\n``` {.python .cell-code}\ntogether = kept.left_join(comms, \"CMTE_ID\").drop(\"CMTE_ID\", \"CMTE_ID_right\")\ntogether\n```\n\n::: {.cell-output .cell-output-display execution_count=22}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ TRANSACTION_PGI  ENTITY_TP  CITY              STATE   TRANSACTION_DT  TRANSACTION_AMT  CMTE_NM                                         ┃\n┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringstringstringstringint64string                                          │\n├─────────────────┼───────────┼──────────────────┼────────┼────────────────┼─────────────────┼─────────────────────────────────────────────────┤\n│ P              IND      COHASSET        MA    01312017      230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      KEY LARGO       FL    01042017      5000UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      LOOKOUT MOUNTAINGA    01312017      230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      NORTH YARMOUTH  ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      ALPHARETTA      GA    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      FALMOUTH        ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      FALMOUTH        ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      HOLLIS CENTER   ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      FALMOUTH        ME    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│ P              IND      ALEXANDRIA      VA    01312017      384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC) │\n│                                                
│\n└─────────────────┴───────────┴──────────────────┴────────┴────────────────┴─────────────────┴─────────────────────────────────────────────────┘\n
\n```\n:::\n:::\n\n\n## Cleaning\n\nFirst, let's drop any contributions that don't have a committee name. There are only 6 of them.\n\n::: {#215670b2 .cell execution_count=10}\n``` {.python .cell-code}\n# We can do this fearlessly, no .copy() needed, because\n# everything in Ibis is immutable. If we did this in pandas,\n# we might start modifying the original DataFrame accidentally!\ncleaned = together\n\nhas_name = cleaned.CMTE_NM.notnull()\ncleaned = cleaned[has_name]\nhas_name.value_counts()\n```\n\n::: {.cell-output .cell-output-display execution_count=23}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ NotNull(CMTE_NM)  NotNull(CMTE_NM)_count ┃\n┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ booleanint64                  │\n├──────────────────┼────────────────────────┤\n│ True             │               21730724 │\n│ False            │                      6 │\n└──────────────────┴────────────────────────┘\n
\n```\n:::\n:::\n\n\nLet's look at the `ENTITY_TP` column. This represents the type of entity that\nmade the contribution:\n\n::: {#8e39507b .cell execution_count=11}\n``` {.python .cell-code}\ntogether.ENTITY_TP.value_counts()\n```\n\n::: {.cell-output .cell-output-display execution_count=24}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n┃ ENTITY_TP  ENTITY_TP_count ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n│ stringint64           │\n├───────────┼─────────────────┤\n│ NULL5289 │\n│ CAN      13659 │\n│ COM      867 │\n│ IND      21687992 │\n│ ORG      18555 │\n│ PAC      3621 │\n│ PTY      49 │\n│ CCM      698 │\n└───────────┴─────────────────┘\n
\n```\n:::\n:::\n\n\nWe only care about contributions from individuals.\n\nOnce we filter on this column, the contents of it are irrelevant, so let's drop it.\n\n::: {#e1453e27 .cell execution_count=12}\n``` {.python .cell-code}\ncleaned = together[_.ENTITY_TP == \"IND\"].drop(\"ENTITY_TP\")\n```\n:::\n\n\nIt looks like the `TRANSACTION_DT` column was a raw string like \"MMDDYYYY\",\nso let's convert that to a proper date type.\n\n::: {#bf3dadc7 .cell execution_count=13}\n``` {.python .cell-code}\nfrom ibis.expr.types import StringValue, DateValue\n\n\ndef mmddyyyy_to_date(val: StringValue) -> DateValue:\n return val.cast(str).lpad(8, \"0\").to_timestamp(\"%m%d%Y\").date()\n\n\ncleaned = cleaned.mutate(date=mmddyyyy_to_date(_.TRANSACTION_DT)).drop(\"TRANSACTION_DT\")\ncleaned\n```\n\n::: {.cell-output .cell-output-display execution_count=26}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓\n┃ TRANSACTION_PGI  CITY              STATE   TRANSACTION_AMT  CMTE_NM                                          date       ┃\n┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩\n│ stringstringstringint64stringdate       │\n├─────────────────┼──────────────────┼────────┼─────────────────┼─────────────────────────────────────────────────┼────────────┤\n│ P              COHASSET        MA    230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              KEY LARGO       FL    5000UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-04 │\n│ P              LOOKOUT MOUNTAINGA    230UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              NORTH YARMOUTH  ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              ALPHARETTA      GA    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              HOLLIS CENTER   ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              FALMOUTH        ME    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│ P              ALEXANDRIA      VA    384UNUM GROUP POLITICAL ACTION COMMITTEE (UNUMPAC)2017-01-31 │\n│           │\n└─────────────────┴──────────────────┴────────┴─────────────────┴─────────────────────────────────────────────────┴────────────┘\n
\n```\n:::\n:::\n\n\nThe `TRANSACTION_PGI` column represents the type (primary, general, etc) of election,\nand the year. But it seems to be not very consistent:\n\n::: {#6cb98e2b .cell execution_count=14}\n``` {.python .cell-code}\ncleaned.TRANSACTION_PGI.topk(10)\n```\n\n::: {.cell-output .cell-output-display execution_count=27}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ TRANSACTION_PGI  CountStar() ┃\n┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringint64       │\n├─────────────────┼─────────────┤\n│ P              17013596 │\n│ G2018          2095123 │\n│ P2018          1677183 │\n│ P2020          208501 │\n│ O2018          161874 │\n│ S2017          124336 │\n│ G2017          98401 │\n│ P2022          91136 │\n│ P2017          61153 │\n│ R2017          54281 │\n└─────────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\n::: {#463caa6b .cell execution_count=15}\n``` {.python .cell-code}\ndef get_election_type(pgi: StringValue) -> StringValue:\n \"\"\"Use the first letter of the TRANSACTION_PGI column to determine the election type\n\n If the first letter is not one of the known election stage, then return null.\n \"\"\"\n election_types = {\n \"P\": \"primary\",\n \"G\": \"general\",\n \"O\": \"other\",\n \"C\": \"convention\",\n \"R\": \"runoff\",\n \"S\": \"special\",\n \"E\": \"recount\",\n }\n first_letter = pgi[0]\n return first_letter.substitute(election_types, else_=ibis.null())\n\n\ncleaned = cleaned.mutate(election_type=get_election_type(_.TRANSACTION_PGI)).drop(\n \"TRANSACTION_PGI\"\n)\ncleaned\n```\n\n::: {.cell-output .cell-output-display execution_count=28}\n```{=html}\n
┏━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ CITY        STATE   TRANSACTION_AMT  CMTE_NM                    date        election_type ┃\n┡━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringstringint64stringdatestring        │\n├────────────┼────────┼─────────────────┼───────────────────────────┼────────────┼───────────────┤\n│ ATLANTA   GA    15NANCY PELOSI FOR CONGRESS2017-06-20primary       │\n│ AUSTIN    TX    15NANCY PELOSI FOR CONGRESS2017-06-04primary       │\n│ WASHINGTONDC    25NANCY PELOSI FOR CONGRESS2017-06-23primary       │\n│ HONOLULU  HI    10NANCY PELOSI FOR CONGRESS2017-04-20primary       │\n│ MAMARONECKNY    110NANCY PELOSI FOR CONGRESS2017-06-02primary       │\n│ REHOBOTH  MA    10NANCY PELOSI FOR CONGRESS2017-06-01primary       │\n│ BERKELEY  CA    25NANCY PELOSI FOR CONGRESS2017-06-05primary       │\n│ BEAUMONT  TX    25NANCY PELOSI FOR CONGRESS2017-04-12primary       │\n│ CONCORD   MA    200NANCY PELOSI FOR CONGRESS2017-05-05primary       │\n│ OXNARD    CA    15NANCY PELOSI FOR CONGRESS2017-03-31primary       │\n│              │\n└────────────┴────────┴─────────────────┴───────────────────────────┴────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\nThat worked well! There are 0 nulls in the resulting column, so we always were\nable to determine the election type.\n\n::: {#ead49c9e .cell execution_count=16}\n``` {.python .cell-code}\ncleaned.election_type.topk(10)\n```\n\n::: {.cell-output .cell-output-display execution_count=29}\n```{=html}\n
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ election_type  CountStar() ┃\n┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringint64       │\n├───────────────┼─────────────┤\n│ primary      19061953 │\n│ general      2216685 │\n│ other        161965 │\n│ special      149572 │\n│ runoff       69637 │\n│ convention   22453 │\n│ recount      5063 │\n│ NULL664 │\n└───────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nAbout 1/20 of transactions are negative. These could represent refunds, or they\ncould be data entry errors. Let's drop them to keep it simple.\n\n::: {#ee56a3f3 .cell execution_count=17}\n``` {.python .cell-code}\nabove_zero = cleaned.TRANSACTION_AMT > 0\ncleaned = cleaned[above_zero]\nabove_zero.value_counts()\n```\n\n::: {.cell-output .cell-output-display execution_count=30}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ Greater(TRANSACTION_AMT, 0)  Greater(TRANSACTION_AMT, 0)_count ┃\n┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ booleanint64                             │\n├─────────────────────────────┼───────────────────────────────────┤\n│ True                        │                          20669809 │\n│ False                       │                           1018183 │\n└─────────────────────────────┴───────────────────────────────────┘\n
\n```\n:::\n:::\n\n\n## Adding Features\n\nNow that the data is cleaned up to a usable format, let's add some features.\n\nFirst, it's useful to categorize donations by size, placing them into buckets\nof small, medium, large, etc.\n\n::: {#0ccc57df .cell execution_count=18}\n``` {.python .cell-code}\nedges = [\n 10,\n 50,\n 100,\n 500,\n 1000,\n 5000,\n]\nlabels = [\n \"<10\",\n \"10-50\",\n \"50-100\",\n \"100-500\",\n \"500-1000\",\n \"1000-5000\",\n \"5000+\",\n]\n\n\ndef bucketize(vals, edges, str_labels):\n # Uses Ibis's .bucket() method to create a categorical column\n int_labels = vals.bucket(edges, include_under=True, include_over=True)\n # Map the integer labels to the string labels\n int_to_str = {str(i): s for i, s in enumerate(str_labels)}\n return int_labels.cast(str).substitute(int_to_str)\n\n\nfeatured = cleaned.mutate(amount_bucket=bucketize(_.TRANSACTION_AMT, edges, labels))\nfeatured\n```\n\n::: {.cell-output .cell-output-display execution_count=31}\n```{=html}\n
┏━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ CITY          STATE   TRANSACTION_AMT  CMTE_NM                date        election_type  amount_bucket ┃\n┡━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringstringint64stringdatestringstring        │\n├──────────────┼────────┼─────────────────┼───────────────────────┼────────────┼───────────────┼───────────────┤\n│ REMINGTON   IN    50AMERICA'S LIBERTY PAC2017-05-30primary      50-100        │\n│ REMINGTON   IN    50AMERICA'S LIBERTY PAC2017-06-05primary      50-100        │\n│ VANCOUVER   WA    100AMERICA'S LIBERTY PAC2017-06-07primary      100-500       │\n│ SOLANA BEACHCA    500AMERICA'S LIBERTY PAC2017-06-26primary      500-1000      │\n│ HILLSDALE   MI    250AMERICA'S LIBERTY PAC2017-05-15primary      100-500       │\n│ MIDDLEBURY  VT    500NBT PAC FEDERAL FUND 2017-06-05primary      500-1000      │\n│ WILLISTON   VT    500NBT PAC FEDERAL FUND 2017-05-30primary      500-1000      │\n│ GLENMONT    NY    350NBT PAC FEDERAL FUND 2017-06-01primary      100-500       │\n│ NORWICH     NY    250NBT PAC FEDERAL FUND 2017-05-31primary      100-500       │\n│ CLIFTON PARKNY    250NBT PAC FEDERAL FUND 2017-06-26primary      100-500       │\n│              │\n└──────────────┴────────┴─────────────────┴───────────────────────┴────────────┴───────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\n## Analysis\n\n### By donation size\n\nOne thing we can look at is the donation breakdown by size:\n- Are most donations small or large?\n- Where do politicians/committees get most of their money from? Large or small donations?\n\nWe also will compare performance of Ibis vs pandas during this groupby.\n\n::: {#6c9dae32 .cell execution_count=19}\n``` {.python .cell-code}\ndef summary_by(table, by):\n return table.group_by(by).agg(\n n_donations=_.count(),\n total_amount=_.TRANSACTION_AMT.sum(),\n mean_amount=_.TRANSACTION_AMT.mean(),\n median_amount=_.TRANSACTION_AMT.approx_median(),\n )\n\n\ndef summary_by_pandas(df, by):\n return df.groupby(by, as_index=False).agg(\n n_donations=(\"election_type\", \"count\"),\n total_amount=(\"TRANSACTION_AMT\", \"sum\"),\n mean_amount=(\"TRANSACTION_AMT\", \"mean\"),\n median_amount=(\"TRANSACTION_AMT\", \"median\"),\n )\n\n\n# persist the input data so the following timings of the group_by are accurate.\nsubset = featured[\"election_type\", \"amount_bucket\", \"TRANSACTION_AMT\"]\nsubset = subset.cache()\npandas_subset = subset.execute()\n```\n:::\n\n\nLet's take a look at what we are actually computing:\n\n::: {#1b310e3e .cell execution_count=20}\n``` {.python .cell-code}\nby_type_and_bucket = summary_by(subset, [\"election_type\", \"amount_bucket\"])\nby_type_and_bucket\n```\n\n::: {.cell-output .cell-output-display execution_count=33}\n```{=html}\n
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ election_type  amount_bucket  n_donations  total_amount  mean_amount   median_amount ┃\n┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringstringint64int64float64int64         │\n├───────────────┼───────────────┼─────────────┼──────────────┼──────────────┼───────────────┤\n│ primary      500-1000     634677334630687527.245649500 │\n│ general      5000+        31254449637314238.8393607537 │\n│ special      500-1000     78114003293512.519908500 │\n│ runoff       100-500      181933088289169.751498100 │\n│ convention   500-1000     1824945321518.268092500 │\n│ general      <10          1158735367424.6321585 │\n│ general      50-100       3043631618431253.17437450 │\n│ general      1000-5000    2461014600252421869.2538511978 │\n│ general      10-50        6607871441158821.80973325 │\n│ other        500-1000     11962535525.504202500 │\n│  │\n└───────────────┴───────────────┴─────────────┴──────────────┴──────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\nOK, now let's do our timings.\n\nOne interesting thing to pay attention to here is the execution time for the following\ngroupby. Before, we could get away with lazy execution: because we only wanted to preview\nthe first few rows, we only had to compute the first few rows, so all our previews were\nvery fast.\n\nBut now, as soon as we do a groupby, we have to actually go through the whole dataset\nin order to compute the aggregate per group. So this is going to be slower. BUT,\nduckdb is still quite fast. It only takes milliseconds to groupby-agg all 20 million rows!\n\n::: {#32424707 .cell execution_count=21}\n``` {.python .cell-code}\n%timeit summary_by(subset, [\"election_type\", \"amount_bucket\"]).execute() # .execute() so we actually fetch the data\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n161 ms ± 4.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n```\n:::\n:::\n\n\nNow let's try the same thing in pandas:\n\n::: {#cc653b7f .cell execution_count=22}\n``` {.python .cell-code}\n%timeit summary_by_pandas(pandas_subset, [\"election_type\", \"amount_bucket\"])\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n2.19 s ± 6.54 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n```\n:::\n:::\n\n\nIt takes about 2 seconds, which is more than 10 times slower than duckdb.\nAt this scale, it again doesn't matter,\nbut you could imagine with a dataset much larger than this, it would matter.\n\nLet's also think about memory usage:\n\n::: {#c967896c .cell execution_count=23}\n``` {.python .cell-code}\npandas_subset.memory_usage(deep=True).sum() / 1e9 # GB\n```\n\n::: {.cell-output .cell-output-display execution_count=36}\n```\n2.782586667\n```\n:::\n:::\n\n\nThe source dataframe is a couple of gigabytes, so probably during the groupby,\nthe peak memory usage is going to be a bit higher than this. 
You could use a profiler\nsuch as [FIL](https://github.com/pythonspeed/filprofiler) if you wanted an exact number,\nI was too lazy to use that here.\n\nAgain, this works on my laptop at this dataset size, but much larger than this and I'd\nstart having problems. Duckdb on the other hand is designed around working out of core\nso it should scale to datasets into the hundreds of gigabytes, much larger than your\ncomputer's RAM.\n\n### Back to analysis\n\nOK, let's plot the result of that groupby.\n\nSurprise! (Or maybe not...) Most donations are small. But most of the money comes\nfrom donations larger than $1000.\n\nWell if that's the case, why do politicians spend so much time soliciting small\ndonations? One explanation is that they can use the number of donations\nas a marketing pitch, to show how popular they are, and thus how viable of a\ncandidate they are.\n\nThis also might explain whose interests are being served by our politicians.\n\n::: {#6808107a .cell execution_count=24}\n``` {.python .cell-code}\nimport altair as alt\n\n# Do some bookkeeping so the buckets are displayed smallest to largest on the charts\nbucket_col = alt.Column(\"amount_bucket:N\", sort=labels)\n\nn_by_bucket = (\n alt.Chart(by_type_and_bucket.execute())\n .mark_bar()\n .encode(\n x=bucket_col,\n y=\"n_donations:Q\",\n color=\"election_type:N\",\n )\n)\ntotal_by_bucket = (\n alt.Chart(by_type_and_bucket.execute())\n .mark_bar()\n .encode(\n x=bucket_col,\n y=\"total_amount:Q\",\n color=\"election_type:N\",\n )\n)\nn_by_bucket | total_by_bucket\n```\n\n::: {.cell-output .cell-output-display execution_count=37}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n### By election stage\n\nLet's look at how donations break down by election stage. Do people donate\ndifferently for primary elections vs general elections?\n\nLet's ignore everything but primary and general elections, since they are the\nmost common, and arguably the most important.\n\n::: {#8a758b63 .cell execution_count=25}\n``` {.python .cell-code}\ngb2 = by_type_and_bucket[_.election_type.isin((\"primary\", \"general\"))]\nn_donations_per_election_type = _.n_donations.sum().over(group_by=\"election_type\")\nfrac = _.n_donations / n_donations_per_election_type\ngb2 = gb2.mutate(frac_n_donations_per_election_type=frac)\ngb2\n```\n\n::: {.cell-output .cell-output-display execution_count=38}\n```{=html}\n
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ election_type  amount_bucket  n_donations  total_amount  mean_amount   median_amount  frac_n_donations_per_election_type ┃\n┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringint64int64float64int64float64                            │\n├───────────────┼───────────────┼─────────────┼──────────────┼──────────────┼───────────────┼────────────────────────────────────┤\n│ general      <10          1158735367424.63215850.052544 │\n│ general      50-100       3043631618431253.174374500.138017 │\n│ general      1000-5000    2461014600252421869.25385119610.111598 │\n│ general      10-50        6607871441158821.809733250.299642 │\n│ general      100-500      700821123174568175.7575301500.317796 │\n│ general      500-1000     17418291015697522.5321625000.078985 │\n│ general      5000+        31254449637314238.83936076010.001417 │\n│ primary      5000+        44085155837111635349.237065100000.002422 │\n│ primary      100-500      3636287637353634175.2759431500.199765 │\n│ primary      500-1000     634677334630687527.2456495000.034867 │\n│  │\n└───────────────┴───────────────┴─────────────┴──────────────┴──────────────┴───────────────┴────────────────────────────────────┘\n
\n```\n:::\n:::\n\n\nIt looks like primary elections get a larger proportion of small donations.\n\n::: {#30710ce2 .cell execution_count=26}\n``` {.python .cell-code}\nalt.Chart(gb2.execute()).mark_bar().encode(\n x=\"election_type:O\",\n y=\"frac_n_donations_per_election_type:Q\",\n color=bucket_col,\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=39}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n### By recipient\n\nLet's look at the top players. Who gets the most donations?\n\nFar and away it is ActBlue, which acts as a conduit for donations to Democratic\ninterests.\n\nBeto O'Rourke is the top individual politician, hats off to him!\n\n::: {#97c0a2c8 .cell execution_count=27}\n``` {.python .cell-code}\nby_recip = summary_by(featured, \"CMTE_NM\")\nby_recip\n```\n\n::: {.cell-output .cell-output-display execution_count=40}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ CMTE_NM                                                           n_donations  total_amount  mean_amount  median_amount ┃\n┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringint64int64float64int64         │\n├──────────────────────────────────────────────────────────────────┼─────────────┼──────────────┼─────────────┼───────────────┤\n│ INDIANA DENTAL PAC                                              11162236560.684685410 │\n│ BEAM SUNTORY INC POLITICAL ACTION COMMITTEE                     40764806159.22850165 │\n│ AMEDISYS, INC. POLITICAL ACTION COMMITTEE                       13225000189.39393975 │\n│ PIEDMONT TRIAD ANESTHESIA P A FEDERAL PAC                       13290375684.659091600 │\n│ AHOLD DELHAIZE USA, INC POLITICAL ACTION COMMITTEE              36948062130.249322100 │\n│ DIMITRI FOR CONGRESS                                            8734719399.068966250 │\n│ RELX INC. POLITICAL ACTION COMMITTEE                            549130690855.89291634 │\n│ MAKING INVESTMENTS MAJORITY INSURED PAC                         14306002185.7142861000 │\n│ AMERICAN ACADEMY OF OTOLARYNGOLOGY-HEAD AND NECK SURGERY ENT PAC765285756373.537255365 │\n│ MIMI WALTERS VICTORY FUND                                       84025148242993.8380952506 │\n│  │\n└──────────────────────────────────────────────────────────────────┴─────────────┴──────────────┴─────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\n::: {#56418e6e .cell execution_count=28}\n``` {.python .cell-code}\ntop_recip = by_recip.order_by(ibis.desc(\"n_donations\")).head(10)\nalt.Chart(top_recip.execute()).mark_bar().encode(\n x=alt.X(\"CMTE_NM:O\", sort=\"-y\"),\n y=\"n_donations:Q\",\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=41}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n### By Location\n\nWhere are the largest donations coming from?\n\n::: {#55b19fc3 .cell execution_count=29}\n``` {.python .cell-code}\nf2 = featured.mutate(loc=_.CITY + \", \" + _.STATE).drop(\"CITY\", \"STATE\")\nby_loc = summary_by(f2, \"loc\")\n# Drop the places with a small number of donations so we're\n# resistant to outliers for the mean\nby_loc = by_loc[_.n_donations > 1000]\nby_loc\n```\n\n::: {.cell-output .cell-output-display execution_count=42}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ loc              n_donations  total_amount  mean_amount  median_amount ┃\n┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ stringint64int64float64int64         │\n├─────────────────┼─────────────┼──────────────┼─────────────┼───────────────┤\n│ NAZARETH, PA   146013871095.00684938 │\n│ FULSHEAR, TX   1504346778230.57047950 │\n│ GLOUCESTER, MA 4956563331113.66646525 │\n│ NORMAN, OK     6195945333152.59612635 │\n│ OAK PARK, IL   120173413138284.02579739 │\n│ AUSTIN, TX     18986533315922175.47163538 │\n│ MIAMI BEACH, FL1282510598453826.390097100 │\n│ SAN ANTONIO, TX14052918925978134.67667235 │\n│ HAMBURG, NY    232217025473.3221368 │\n│ PITTSBURGH, PA 7420814358578193.49097142 │\n│  │\n└─────────────────┴─────────────┴──────────────┴─────────────┴───────────────┘\n
\n```\n:::\n:::\n\n\n::: {#cc1697c5 .cell execution_count=30}\n``` {.python .cell-code}\ndef top_by(col):\n top = by_loc.order_by(ibis.desc(col)).head(10)\n return (\n alt.Chart(top.execute())\n .mark_bar()\n .encode(\n x=alt.X('loc:O', sort=\"-y\"),\n y=col,\n )\n )\n\n\ntop_by(\"n_donations\") | top_by(\"total_amount\") | top_by(\"mean_amount\") | top_by(\n \"median_amount\"\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=43}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n### By month\n\nWhen do the donations come in?\n\n::: {#0d055d90 .cell execution_count=31}\n``` {.python .cell-code}\nby_month = summary_by(featured, _.date.month().name(\"month_int\"))\n# Sorta hacky, .substitute doesn't work to change dtypes (yet?)\n# so we cast to string and then do our mapping\nmonth_map = {\n \"1\": \"Jan\",\n \"2\": \"Feb\",\n \"3\": \"Mar\",\n \"4\": \"Apr\",\n \"5\": \"May\",\n \"6\": \"Jun\",\n \"7\": \"Jul\",\n \"8\": \"Aug\",\n \"9\": \"Sep\",\n \"10\": \"Oct\",\n \"11\": \"Nov\",\n \"12\": \"Dec\",\n}\nby_month = by_month.mutate(month_str=_.month_int.cast(str).substitute(month_map))\nby_month\n```\n\n::: {.cell-output .cell-output-display execution_count=44}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓\n┃ month_int  n_donations  total_amount  mean_amount  median_amount  month_str ┃\n┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩\n│ int32int64int64float64int64string    │\n├───────────┼─────────────┼──────────────┼─────────────┼───────────────┼───────────┤\n│      NULL1514250297165.321664100NULL      │\n│         1348979174837854500.998209124Jan       │\n│         2581646255997655440.126219100Feb       │\n│         31042577430906797413.30932681Mar       │\n│         41088244299252692274.98676050Apr       │\n│         51374247387317192281.83957648May       │\n│         61667285465305247279.07961044Jun       │\n│         71607053320528605199.45117235Jul       │\n│         82023466473544182234.02626135Aug       │\n│         92583847697888624270.09672938Sep       │\n│                  │\n└───────────┴─────────────┴──────────────┴─────────────┴───────────────┴───────────┘\n
\n```\n:::\n:::\n\n\n::: {#7002ddb8 .cell execution_count=32}\n``` {.python .cell-code}\nmonths_in_order = list(month_map.values())\nalt.Chart(by_month.execute()).mark_bar().encode(\n x=alt.X(\"month_str:O\", sort=months_in_order),\n y=\"n_donations:Q\",\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=45}\n```{=html}\n\n\n
\n\n```\n:::\n:::\n\n\n## Conclusion\n\nThanks for following along! I hope you've learned something about Ibis, and\nmaybe even about campaign finance.\n\nIbis is a great tool for exploring data. I now find myself reaching for it\nwhen in the past I would have reached for pandas.\n\nSome of the highlights for me:\n\n- Fast, lazy execution, a great display format, and good type hinting/editor support for a great REPL experience.\n- Very well thought-out API and semantics (e.g. `isinstance(val, NumericValue)`?? That's beautiful!)\n- Fast and fairly complete string support, since I work with a lot of text data.\n- Extremely responsive maintainers. Sometimes I've submitted multiple feature requests and bug reports in a single day, and a PR has been merged by the next day.\n- Escape hatch to SQL. I didn't have to use that here, but if something isn't supported, you can always fall back to SQL.\n\nCheck out [The Ibis Website](https://ibis-project.org/) for more information.\n\n", "supporting": [ - "index_files/figure-html" + "index_files" ], "filters": [], "includes": { "include-in-header": [ - "\n\n\n" + "\n\n\n" ] } } diff --git a/docs/_quarto.yml b/docs/_quarto.yml index b3c11e4494ff..d2d3ec3c1af8 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -298,10 +298,6 @@ quartodoc: - name: param dynamic: true signature_name: full - - name: NA - # Ideally exposed under `ibis` but that doesn't seem to work?? 
- package: ibis.expr.api - signature_name: full - name: "null" dynamic: true signature_name: full diff --git a/docs/posts/campaign-finance/index.qmd b/docs/posts/campaign-finance/index.qmd index 3d8d9fc19330..a2a0a287e388 100644 --- a/docs/posts/campaign-finance/index.qmd +++ b/docs/posts/campaign-finance/index.qmd @@ -245,7 +245,7 @@ def get_election_type(pgi: StringValue) -> StringValue: "E": "recount", } first_letter = pgi[0] - return first_letter.substitute(election_types, else_=ibis.NA) + return first_letter.substitute(election_types, else_=ibis.null()) cleaned = cleaned.mutate(election_type=get_election_type(_.TRANSACTION_PGI)).drop( diff --git a/docs/tutorials/ibis-for-pandas-users.qmd b/docs/tutorials/ibis-for-pandas-users.qmd index 876fe4ac068f..e0fc2f5908e5 100644 --- a/docs/tutorials/ibis-for-pandas-users.qmd +++ b/docs/tutorials/ibis-for-pandas-users.qmd @@ -507,7 +507,7 @@ represented by `NaN`. This can be confusing when working with numeric data, since `NaN` is also a valid floating point value (along with `+/-inf`). In Ibis, we try to be more precise: All data types are nullable, and we use -`ibis.NA` to represent `NULL` values, and all datatypes have a `.isnull()` method. +`ibis.null()` to represent `NULL` values, and all datatypes have a `.isnull()` method. For floating point values, we use different values for `NaN` and `+/-inf`, and there are the additional methods `.isnan()` and `.isinf()`. 
diff --git a/docs/tutorials/ibis-for-sql-users.qmd b/docs/tutorials/ibis-for-sql-users.qmd index 577f7b015111..534090bfce64 100644 --- a/docs/tutorials/ibis-for-sql-users.qmd +++ b/docs/tutorials/ibis-for-sql-users.qmd @@ -522,10 +522,10 @@ ibis.to_sql(expr) ### Using `NULL` in expressions -To use `NULL` in an expression, either use the special `ibis.NA` value: +To use `NULL` in an expression, use `ibis.null()`: ```{python} -pos_two = (t.two > 0).ifelse(t.two, ibis.NA) +pos_two = (t.two > 0).ifelse(t.two, ibis.null()) expr = t.mutate(two_positive=pos_two) ibis.to_sql(expr) ``` diff --git a/ibis/__init__.py b/ibis/__init__.py index 2ec14e182330..6bc98966502b 100644 --- a/ibis/__init__.py +++ b/ibis/__init__.py @@ -4,6 +4,9 @@ __version__ = "9.0.0" +import warnings +from typing import Any + from ibis import examples, util from ibis.backends import BaseBackend from ibis.common.exceptions import IbisError @@ -36,7 +39,7 @@ def __dir__() -> list[str]: return sorted(out) -def __getattr__(name: str) -> BaseBackend: +def load_backend(name: str) -> BaseBackend: """Load backends in a lazy way with `ibis.`. This also registers the backend options. @@ -52,6 +55,7 @@ def __getattr__(name: str) -> BaseBackend: attribute is "cached", so this function is only called the first time. """ + entry_points = {ep for ep in util.backend_entry_points() if ep.name == name} if not entry_points: @@ -125,3 +129,18 @@ def connect(*args, **kwargs): setattr(proxy, name, getattr(backend, name)) return proxy + + +def __getattr__(name: str) -> Any: + if name == "NA": + warnings.warn( + "The 'ibis.NA' constant is deprecated as of v9.1 and will be removed in a future " + "version. 
Use 'ibis.null()' instead.", + DeprecationWarning, + stacklevel=2, + ) + import ibis + + return ibis.null() + else: + return load_backend(name) diff --git a/ibis/backends/clickhouse/tests/test_functions.py b/ibis/backends/clickhouse/tests/test_functions.py index 1385ba7a34ab..dfe9d5e0f01e 100644 --- a/ibis/backends/clickhouse/tests/test_functions.py +++ b/ibis/backends/clickhouse/tests/test_functions.py @@ -116,8 +116,8 @@ def test_isnull_notnull(con, expr, expected): ("expr", "expected"), [ (ibis.coalesce(5, None, 4), 5), - (ibis.coalesce(ibis.NA, 4, ibis.NA), 4), - (ibis.coalesce(ibis.NA, ibis.NA, 3.14), 3.14), + (ibis.coalesce(ibis.null(), 4, ibis.null()), 4), + (ibis.coalesce(ibis.null(), ibis.null(), 3.14), 3.14), ], ) def test_coalesce(con, expr, expected): @@ -127,7 +127,7 @@ def test_coalesce(con, expr, expected): @pytest.mark.parametrize( ("expr", "expected"), [ - (ibis.NA.fill_null(5), 5), + (ibis.null().fill_null(5), 5), (L(5).fill_null(10), 5), (L(5).nullif(5), None), (L(10).nullif(5), 10), @@ -150,7 +150,7 @@ def test_fill_null_nullif(con, expr, expected): (L(datetime(2015, 9, 1, hour=14, minute=48, second=5)), "DateTime"), (L(date(2015, 9, 1)), "Date"), param( - ibis.NA, + ibis.null(), "Null", marks=pytest.mark.xfail( raises=AssertionError, @@ -418,7 +418,7 @@ def test_numeric_builtins_work(alltypes, df): def test_null_column(alltypes): t = alltypes nrows = t.count().execute() - expr = t.mutate(na_column=ibis.NA).na_column + expr = t.mutate(na_column=ibis.null()).na_column result = expr.execute() expected = pd.Series([None] * nrows, name="na_column") tm.assert_series_equal(result, expected) diff --git a/ibis/backends/dask/tests/test_window.py b/ibis/backends/dask/tests/test_window.py index ef2249dd099d..2a6d17c67e13 100644 --- a/ibis/backends/dask/tests/test_window.py +++ b/ibis/backends/dask/tests/test_window.py @@ -20,7 +20,7 @@ def sort_kind(): return "mergesort" -default = pytest.mark.parametrize("default", [ibis.NA, ibis.literal("a")]) +default = 
pytest.mark.parametrize("default", [ibis.null(), ibis.literal("a")]) row_offset = pytest.mark.parametrize("row_offset", list(map(ibis.literal, [-1, 1, 0]))) range_offset = pytest.mark.parametrize( "range_offset", @@ -48,7 +48,7 @@ def test_lead(con, t, df, row_offset, default, row_window): expr = t.dup_strings.lead(row_offset, default=default).over(row_window) result = expr.execute() expected = df.dup_strings.shift(con.execute(-row_offset)).compute() - if default is not ibis.NA: + if default is not ibis.null(): expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected, check_names=False) @@ -59,7 +59,7 @@ def test_lag(con, t, df, row_offset, default, row_window): expr = t.dup_strings.lag(row_offset, default=default).over(row_window) result = expr.execute() expected = df.dup_strings.shift(con.execute(row_offset)).compute() - if default is not ibis.NA: + if default is not ibis.null(): expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected, check_names=False) @@ -78,7 +78,7 @@ def test_lead_delta(con, t, pandas_df, range_offset, default, range_window): .reindex(pandas_df.plain_datetimes_naive) .reset_index(drop=True) ) - if default is not ibis.NA: + if default is not ibis.null(): expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected, check_names=False) @@ -98,7 +98,7 @@ def test_lag_delta(t, con, pandas_df, range_offset, default, range_window): .reindex(pandas_df.plain_datetimes_naive) .reset_index(drop=True) ) - if default is not ibis.NA: + if default is not ibis.null(): expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected, check_names=False) diff --git a/ibis/backends/impala/tests/test_case_exprs.py b/ibis/backends/impala/tests/test_case_exprs.py index c24d3671abda..a195928b1221 100644 --- a/ibis/backends/impala/tests/test_case_exprs.py +++ b/ibis/backends/impala/tests/test_case_exprs.py @@ -100,6 +100,6 @@ def 
test_identical_to(mockcon, snapshot): def test_identical_to_special_case(snapshot): - expr = ibis.NA.cast("int64").identical_to(ibis.NA.cast("int64")).name("tmp") + expr = ibis.null().cast("int64").identical_to(ibis.null().cast("int64")).name("tmp") result = ibis.to_sql(expr, dialect="impala") snapshot.assert_match(result, "out.sql") diff --git a/ibis/backends/impala/tests/test_exprs.py b/ibis/backends/impala/tests/test_exprs.py index 2919d7946ee3..45a0ac96d76e 100644 --- a/ibis/backends/impala/tests/test_exprs.py +++ b/ibis/backends/impala/tests/test_exprs.py @@ -384,7 +384,7 @@ def test_decimal_timestamp_builtins(con): dc * 2, dc**2, dc.cast("double"), - api.ifelse(table.l_discount > 0, dc * table.l_discount, api.NA), + api.ifelse(table.l_discount > 0, dc * table.l_discount, api.null()), dc.fill_null(0), ts < (ibis.now() + ibis.interval(months=3)), ts < (ibis.timestamp("2005-01-01") + ibis.interval(months=3)), @@ -632,10 +632,10 @@ def test_unions_with_ctes(con, alltypes): @pytest.mark.parametrize( ("left", "right", "expected"), [ - (ibis.NA.cast("int64"), ibis.NA.cast("int64"), True), + (ibis.null().cast("int64"), ibis.null().cast("int64"), True), (L(1), L(1), True), - (ibis.NA.cast("int64"), L(1), False), - (L(1), ibis.NA.cast("int64"), False), + (ibis.null().cast("int64"), L(1), False), + (L(1), ibis.null().cast("int64"), False), (L(0), L(1), False), (L(1), L(0), False), ], diff --git a/ibis/backends/pandas/tests/test_window.py b/ibis/backends/pandas/tests/test_window.py index 791f29133abb..d588120b8fd4 100644 --- a/ibis/backends/pandas/tests/test_window.py +++ b/ibis/backends/pandas/tests/test_window.py @@ -20,7 +20,7 @@ def sort_kind(): return "mergesort" -default = pytest.mark.parametrize("default", [ibis.NA, ibis.literal("a")]) +default = pytest.mark.parametrize("default", [ibis.null(), ibis.literal("a")]) row_offset = pytest.mark.parametrize("row_offset", list(map(ibis.literal, [-1, 1, 0]))) range_offset = pytest.mark.parametrize( "range_offset", @@ -49,7 
+49,7 @@ def test_lead(t, df, row_offset, default, row_window): expr = t.dup_strings.lead(row_offset, default=default).over(row_window) result = expr.execute() expected = df.dup_strings.shift(con.execute(-row_offset)) - if default is not ibis.NA: + if default is not ibis.null(): expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @@ -61,7 +61,7 @@ def test_lag(t, df, row_offset, default, row_window): expr = t.dup_strings.lag(row_offset, default=default).over(row_window) result = expr.execute() expected = df.dup_strings.shift(con.execute(row_offset)) - if default is not ibis.NA: + if default is not ibis.null(): expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @@ -80,7 +80,7 @@ def test_lead_delta(t, df, range_offset, default, range_window): .reindex(df.plain_datetimes_naive) .reset_index(drop=True) ) - if default is not ibis.NA: + if default is not ibis.null(): expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) @@ -100,7 +100,7 @@ def test_lag_delta(t, df, range_offset, default, range_window): .reindex(df.plain_datetimes_naive) .reset_index(drop=True) ) - if default is not ibis.NA: + if default is not ibis.null(): expected = expected.fillna(con.execute(default)) tm.assert_series_equal(result, expected.rename("tmp")) diff --git a/ibis/backends/postgres/tests/test_functions.py b/ibis/backends/postgres/tests/test_functions.py index 852431d3ebdf..93467491a3d4 100644 --- a/ibis/backends/postgres/tests/test_functions.py +++ b/ibis/backends/postgres/tests/test_functions.py @@ -150,7 +150,7 @@ def test_strftime(con, pattern): [ param(L("foo_bar"), "text", id="text"), param(L(5), "integer", id="integer"), - param(ibis.NA, "null", id="null"), + param(ibis.null(), "null", id="null"), # TODO(phillipc): should this really be double? 
param(L(1.2345), "numeric", id="numeric"), param( @@ -335,7 +335,7 @@ def test_regexp_extract(con, expr, expected): @pytest.mark.parametrize( ("expr", "expected"), [ - param(ibis.NA.fill_null(5), 5, id="filled"), + param(ibis.null().fill_null(5), 5, id="filled"), param(L(5).fill_null(10), 5, id="not_filled"), param(L(5).nullif(5), None, id="nullif_null"), param(L(10).nullif(5), 10, id="nullif_not_null"), @@ -349,8 +349,8 @@ def test_fill_null_nullif(con, expr, expected): ("expr", "expected"), [ param(ibis.coalesce(5, None, 4), 5, id="first"), - param(ibis.coalesce(ibis.NA, 4, ibis.NA), 4, id="second"), - param(ibis.coalesce(ibis.NA, ibis.NA, 3.14), 3.14, id="third"), + param(ibis.coalesce(ibis.null(), 4, ibis.null()), 4, id="second"), + param(ibis.coalesce(ibis.null(), ibis.null(), 3.14), 3.14, id="third"), ], ) def test_coalesce(con, expr, expected): @@ -360,12 +360,12 @@ def test_coalesce(con, expr, expected): @pytest.mark.parametrize( ("expr", "expected"), [ - param(ibis.coalesce(ibis.NA, ibis.NA), None, id="all_null"), + param(ibis.coalesce(ibis.null(), ibis.null()), None, id="all_null"), param( ibis.coalesce( - ibis.NA.cast("int8"), - ibis.NA.cast("int8"), - ibis.NA.cast("int8"), + ibis.null().cast("int8"), + ibis.null().cast("int8"), + ibis.null().cast("int8"), ), None, id="all_nulls_with_all_cast", @@ -377,7 +377,7 @@ def test_coalesce_all_na(con, expr, expected): def test_coalesce_all_na_double(con): - expr = ibis.coalesce(ibis.NA, ibis.NA, ibis.NA.cast("double")) + expr = ibis.coalesce(ibis.null(), ibis.null(), ibis.null().cast("double")) assert np.isnan(con.execute(expr)) @@ -815,14 +815,14 @@ def test_first_last_value(alltypes, df, func, expected_index): def test_null_column(alltypes): t = alltypes nrows = t.count().execute() - expr = t.mutate(na_column=ibis.NA).na_column + expr = t.mutate(na_column=ibis.null()).na_column result = expr.execute() tm.assert_series_equal(result, pd.Series([None] * nrows, name="na_column")) def 
test_null_column_union(alltypes, df): t = alltypes - s = alltypes[["double_col"]].mutate(string_col=ibis.NA.cast("string")) + s = alltypes[["double_col"]].mutate(string_col=ibis.null().cast("string")) expr = t[["double_col", "string_col"]].union(s) result = expr.execute() nrows = t.count().execute() diff --git a/ibis/backends/risingwave/tests/test_functions.py b/ibis/backends/risingwave/tests/test_functions.py index c63f2bd64419..89c012e7f026 100644 --- a/ibis/backends/risingwave/tests/test_functions.py +++ b/ibis/backends/risingwave/tests/test_functions.py @@ -166,7 +166,7 @@ def test_regexp(con, expr, expected): @pytest.mark.parametrize( ("expr", "expected"), [ - param(ibis.NA.fill_null(5), 5, id="filled"), + param(ibis.null().fill_null(5), 5, id="filled"), param(L(5).fill_null(10), 5, id="not_filled"), param(L(5).nullif(5), None, id="nullif_null"), param(L(10).nullif(5), 10, id="nullif_not_null"), @@ -180,8 +180,8 @@ def test_fill_null_nullif(con, expr, expected): ("expr", "expected"), [ param(ibis.coalesce(5, None, 4), 5, id="first"), - param(ibis.coalesce(ibis.NA, 4, ibis.NA), 4, id="second"), - param(ibis.coalesce(ibis.NA, ibis.NA, 3.14), 3.14, id="third"), + param(ibis.coalesce(ibis.null(), 4, ibis.null()), 4, id="second"), + param(ibis.coalesce(ibis.null(), ibis.null(), 3.14), 3.14, id="third"), ], ) def test_coalesce(con, expr, expected): @@ -191,12 +191,12 @@ def test_coalesce(con, expr, expected): @pytest.mark.parametrize( ("expr", "expected"), [ - param(ibis.coalesce(ibis.NA, ibis.NA), None, id="all_null"), + param(ibis.coalesce(ibis.null(), ibis.null()), None, id="all_null"), param( ibis.coalesce( - ibis.NA.cast("int8"), - ibis.NA.cast("int8"), - ibis.NA.cast("int8"), + ibis.null().cast("int8"), + ibis.null().cast("int8"), + ibis.null().cast("int8"), ), None, id="all_nulls_with_all_cast", @@ -208,7 +208,7 @@ def test_coalesce_all_na(con, expr, expected): def test_coalesce_all_na_double(con): - expr = ibis.coalesce(ibis.NA, ibis.NA, 
ibis.NA.cast("double")) + expr = ibis.coalesce(ibis.null(), ibis.null(), ibis.null().cast("double")) assert np.isnan(con.execute(expr)) @@ -595,7 +595,7 @@ def test_first_last_value(alltypes, df, func, expected_index): def test_null_column(alltypes): t = alltypes nrows = t.count().execute() - expr = t.mutate(na_column=ibis.NA).na_column + expr = t.mutate(na_column=ibis.null()).na_column result = expr.execute() tm.assert_series_equal(result, pd.Series([None] * nrows, name="na_column")) diff --git a/ibis/backends/sqlite/tests/test_client.py b/ibis/backends/sqlite/tests/test_client.py index cafea0abe3cd..d7d5def383b0 100644 --- a/ibis/backends/sqlite/tests/test_client.py +++ b/ibis/backends/sqlite/tests/test_client.py @@ -47,7 +47,7 @@ def test_builtin_agg_udf(con): def total(x) -> float: """Totally total.""" - expr = total(con.tables.functional_alltypes.limit(2).select(n=ibis.NA).n) + expr = total(con.tables.functional_alltypes.limit(2).select(n=ibis.null()).n) result = con.execute(expr) assert result == 0.0 diff --git a/ibis/backends/tests/sql/test_sql.py b/ibis/backends/tests/sql/test_sql.py index 1f9e95542e12..6f1d116374e9 100644 --- a/ibis/backends/tests/sql/test_sql.py +++ b/ibis/backends/tests/sql/test_sql.py @@ -121,7 +121,7 @@ def test_coalesce(functional_alltypes, snapshot): d = functional_alltypes.double_col f = functional_alltypes.float_col - expr = ibis.coalesce((d > 30).ifelse(d, ibis.NA), ibis.NA, f).name("tmp") + expr = ibis.coalesce((d > 30).ifelse(d, ibis.null()), ibis.null(), f).name("tmp") snapshot.assert_match(to_sql(expr.name("tmp")), "out.sql") diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 6db2eb4a68fa..cf7c1f1e3b69 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -118,7 +118,7 @@ def test_boolean_literal(con, backend): @pytest.mark.parametrize( ("expr", "expected"), [ - param(ibis.NA.fill_null(5), 5, id="na_fill_null"), + param(ibis.null().fill_null(5), 
5, id="na_fill_null"), param(ibis.literal(5).fill_null(10), 5, id="non_na_fill_null"), param(ibis.literal(5).nullif(5), None, id="nullif_null"), param(ibis.literal(10).nullif(5), 10, id="nullif_not_null"), @@ -159,7 +159,10 @@ def test_scalar_fill_null_nullif(con, expr, expected): id="nan_col", ), param( - "none_col", ibis.NA.cast("float64"), methodcaller("isnull"), id="none_col" + "none_col", + ibis.null().cast("float64"), + methodcaller("isnull"), + id="none_col", ), ], ) @@ -224,8 +227,8 @@ def test_column_fill_null(backend, alltypes, value): ("expr", "expected"), [ param(ibis.coalesce(5, None, 4), 5, id="generic"), - param(ibis.coalesce(ibis.NA, 4, ibis.NA), 4, id="null_start_end"), - param(ibis.coalesce(ibis.NA, ibis.NA, 3.14), 3.14, id="non_null_last"), + param(ibis.coalesce(ibis.null(), 4, ibis.null()), 4, id="null_start_end"), + param(ibis.coalesce(ibis.null(), ibis.null(), 3.14), 3.14, id="non_null_last"), ], ) def test_coalesce(con, expr, expected): @@ -539,9 +542,9 @@ def test_drop_null_table(backend, alltypes, how, subset): is_four = alltypes.int_col == 4 table = alltypes.mutate( - col_1=is_two.ifelse(ibis.NA, alltypes.float_col), - col_2=is_four.ifelse(ibis.NA, alltypes.float_col), - col_3=(is_two | is_four).ifelse(ibis.NA, alltypes.float_col), + col_1=is_two.ifelse(ibis.null(), alltypes.float_col), + col_2=is_four.ifelse(ibis.null(), alltypes.float_col), + col_3=(is_two | is_four).ifelse(ibis.null(), alltypes.float_col), ).select("col_1", "col_2", "col_3") table_pandas = table.execute() @@ -931,7 +934,7 @@ def test_logical_negation_column(backend, alltypes, df, op): [("int64", 0, 1), ("float64", 0.0, 1.0)], ) def test_zero_ifnull_literals(con, dtype, zero, expected): - assert con.execute(ibis.NA.cast(dtype).fill_null(0)) == zero + assert con.execute(ibis.null().cast(dtype).fill_null(0)) == zero assert con.execute(ibis.literal(expected, type=dtype).fill_null(0)) == expected diff --git a/ibis/backends/tests/test_map.py b/ibis/backends/tests/test_map.py 
index 491efc46f281..e637fb19a8e0 100644 --- a/ibis/backends/tests/test_map.py +++ b/ibis/backends/tests/test_map.py @@ -606,7 +606,7 @@ def test_map_get_with_incompatible_value_different_kind(con): @mark_notimpl_risingwave_hstore @mark_notyet_postgres -@pytest.mark.parametrize("null_value", [None, ibis.NA]) +@pytest.mark.parametrize("null_value", [None, ibis.null()]) def test_map_get_with_null_on_not_nullable(con, null_value): map_type = dt.Map(dt.string, dt.Int16(nullable=False)) value = ibis.literal({"A": 1000, "B": 2000}).cast(map_type) @@ -615,7 +615,7 @@ def test_map_get_with_null_on_not_nullable(con, null_value): assert pd.isna(result) -@pytest.mark.parametrize("null_value", [None, ibis.NA]) +@pytest.mark.parametrize("null_value", [None, ibis.null()]) @pytest.mark.notyet( ["flink"], raises=Py4JJavaError, reason="Flink cannot handle typeless nulls" ) diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py index 7761253ebbeb..423b116e440d 100644 --- a/ibis/backends/tests/test_string.py +++ b/ibis/backends/tests/test_string.py @@ -923,7 +923,7 @@ def test_levenshtein(con, right): "expr", [ param(ibis.case().when(True, "%").end(), id="case"), - param(ibis.ifelse(True, "%", ibis.NA), id="ifelse"), + param(ibis.ifelse(True, "%", ibis.null()), id="ifelse"), ], ) def test_no_conditional_percent_escape(con, expr): diff --git a/ibis/backends/tests/test_struct.py b/ibis/backends/tests/test_struct.py index c791318f15d6..6a7429a6c2ff 100644 --- a/ibis/backends/tests/test_struct.py +++ b/ibis/backends/tests/test_struct.py @@ -73,7 +73,7 @@ def test_all_fields(struct, struct_df): _SIMPLE_DICT, type="struct", ) -_NULL_STRUCT_LITERAL = ibis.NA.cast("struct") +_NULL_STRUCT_LITERAL = ibis.null().cast("struct") @pytest.mark.notimpl(["postgres", "risingwave"]) diff --git a/ibis/backends/tests/test_window.py b/ibis/backends/tests/test_window.py index 88ad8e55ab15..b0684a278a5d 100644 --- a/ibis/backends/tests/test_window.py +++ 
b/ibis/backends/tests/test_window.py @@ -637,7 +637,7 @@ def test_simple_ungrouped_unbound_following_window( @pytest.mark.xfail_version(datafusion=["datafusion==35"]) def test_simple_ungrouped_window_with_scalar_order_by(alltypes): t = alltypes[alltypes.double_col < 50].order_by("id") - w = ibis.window(rows=(0, None), order_by=ibis.NA) + w = ibis.window(rows=(0, None), order_by=ibis.null()) expr = t.double_col.sum().over(w).name("double_col") # hard to reproduce this in pandas, so just test that it actually executes expr.execute() diff --git a/ibis/expr/api.py b/ibis/expr/api.py index 9eb8039bfb34..e99f4d8facb2 100644 --- a/ibis/expr/api.py +++ b/ibis/expr/api.py @@ -57,7 +57,6 @@ "Column", "Deferred", "Expr", - "NA", "Scalar", "Schema", "Table", @@ -197,35 +196,6 @@ pi = ops.Pi().to_expr() -NA = null() -"""The NULL scalar. - -This is an untyped NULL. If you want a typed NULL, use eg `ibis.null(str)`. - -Examples --------- ->>> import ibis ->>> ibis.options.interactive = True ->>> ibis.NA.isnull() -┌──────┐ -│ True │ -└──────┘ - -datatype-specific methods aren't available on `NA`: - ->>> ibis.NA.upper() # quartodoc: +EXPECTED_FAILURE -Traceback (most recent call last): - ... -AttributeError: 'NullScalar' object has no attribute 'upper' - -Instead, use the typed `ibis.null`: - ->>> ibis.null(str).upper().isnull() -┌──────┐ -│ True │ -└──────┘ -""" - deferred = _ """Deferred expression object. 
diff --git a/ibis/tests/expr/test_table.py b/ibis/tests/expr/test_table.py index be1a20cfa093..4dab6aee362a 100644 --- a/ibis/tests/expr/test_table.py +++ b/ibis/tests/expr/test_table.py @@ -569,7 +569,7 @@ def test_order_by_asc_deferred_sort_key(table): @pytest.mark.parametrize( ("key", "expected"), [ - param(ibis.NA, ibis.NA.op(), id="na"), + param(ibis.null(), ibis.null().op(), id="na"), param(rand, rand.op(), id="random"), param(1.0, ibis.literal(1.0).op(), id="float"), param(ibis.literal("a"), ibis.literal("a").op(), id="string"), diff --git a/ibis/tests/expr/test_timestamp.py b/ibis/tests/expr/test_timestamp.py index 454355458a62..45b8972d5dc4 100644 --- a/ibis/tests/expr/test_timestamp.py +++ b/ibis/tests/expr/test_timestamp.py @@ -106,7 +106,7 @@ def test_greater_comparison_pandas_timestamp(alltypes): def test_timestamp_precedence(): ts = ibis.literal(datetime.now()) - highest_type = rlz.highest_precedence_dtype([ibis.NA.op(), ts.op()]) + highest_type = rlz.highest_precedence_dtype([ibis.null().op(), ts.op()]) assert highest_type == dt.timestamp diff --git a/ibis/tests/expr/test_value_exprs.py b/ibis/tests/expr/test_value_exprs.py index cdd60788db9f..e7b57376052a 100644 --- a/ibis/tests/expr/test_value_exprs.py +++ b/ibis/tests/expr/test_value_exprs.py @@ -352,7 +352,7 @@ def test_notnull(table): @pytest.mark.parametrize( "value", - [None, ibis.NA, ibis.literal(None, type="int32")], + [None, ibis.null(), ibis.literal(None, type="int32")], ids=["none", "NA", "typed-null"], ) def test_null_eq_and_ne(table, value): @@ -648,7 +648,7 @@ def test_or_(table): def test_null_column(): t = ibis.table([("a", "string")], name="t") - s = t.mutate(b=ibis.NA) + s = t.mutate(b=ibis.null()) assert s.b.type() == dt.null assert isinstance(s.b, ir.NullColumn) @@ -657,8 +657,8 @@ def test_null_column_union(): s = ibis.table([("a", "string"), ("b", "double")]) t = ibis.table([("a", "string")]) with pytest.raises(ibis.common.exceptions.RelationError): - 
s.union(t.mutate(b=ibis.NA)) # needs a type - assert s.union(t.mutate(b=ibis.NA.cast("double"))).schema() == s.schema() + s.union(t.mutate(b=ibis.null())) # needs a type + assert s.union(t.mutate(b=ibis.null().cast("double"))).schema() == s.schema() def test_string_compare_numeric_array(table): @@ -843,12 +843,12 @@ def test_substitute_dict(): ) assert_equal(result, expected) - result = table.foo.substitute(subs, else_=ibis.NA) + result = table.foo.substitute(subs, else_=ibis.null()) expected = ( ibis.case() .when(table.foo == "a", "one") .when(table.foo == "b", table.bar) - .else_(ibis.NA) + .else_(ibis.null()) .end() ) assert_equal(result, expected) @@ -926,7 +926,7 @@ def test_generic_value_api_no_arithmetic(value, operation): ("value", "expected"), [(5, dt.int8), (5.4, dt.double), ("abc", dt.string)] ) def test_fill_null_null(value, expected): - assert ibis.NA.fill_null(value).type().equals(expected) + assert ibis.null().fill_null(value).type().equals(expected) @pytest.mark.parametrize( @@ -1229,7 +1229,7 @@ def test_map_get_with_incompatible_value_different_kind(): assert value.get("C", 3.0).type() == dt.float64 -@pytest.mark.parametrize("null_value", [None, ibis.NA]) +@pytest.mark.parametrize("null_value", [None, ibis.null()]) def test_map_get_with_null_on_not_nullable(null_value): map_type = dt.Map(dt.string, dt.Int16(nullable=False)) value = ibis.literal({"A": 1000, "B": 2000}).cast(map_type) @@ -1238,14 +1238,14 @@ def test_map_get_with_null_on_not_nullable(null_value): assert expr.type() == dt.Int16(nullable=True) -@pytest.mark.parametrize("null_value", [None, ibis.NA]) +@pytest.mark.parametrize("null_value", [None, ibis.null()]) def test_map_get_with_null_on_nullable(null_value): value = ibis.literal({"A": 1000, "B": None}) result = value.get("C", null_value) assert result.type().nullable -@pytest.mark.parametrize("null_value", [None, ibis.NA]) +@pytest.mark.parametrize("null_value", [None, ibis.null()]) def 
test_map_get_with_null_on_null_type_with_null(null_value): value = ibis.literal({"A": None, "B": None}) result = value.get("C", null_value) @@ -1378,13 +1378,13 @@ def test_repr_list_of_lists_in_table(): @pytest.mark.parametrize( ("expr", "expected_type"), [ - (ibis.coalesce(ibis.NA, 1), dt.int8), - (ibis.coalesce(1, ibis.NA), dt.int8), - (ibis.coalesce(ibis.NA, 1000), dt.int16), - (ibis.coalesce(ibis.NA), dt.null), - (ibis.coalesce(ibis.NA, ibis.NA), dt.null), + (ibis.coalesce(ibis.null(), 1), dt.int8), + (ibis.coalesce(1, ibis.null()), dt.int8), + (ibis.coalesce(ibis.null(), 1000), dt.int16), + (ibis.coalesce(ibis.null()), dt.null), + (ibis.coalesce(ibis.null(), ibis.null()), dt.null), ( - ibis.coalesce(ibis.NA, ibis.NA.cast("array")), + ibis.coalesce(ibis.null(), ibis.null().cast("array")), dt.Array(dt.string), ), ], @@ -1508,14 +1508,14 @@ def test_deferred_r_ops(op_name, expected_left, expected_right): @pytest.mark.parametrize( ("expr_fn", "expected_type"), [ - (lambda t: ibis.ifelse(t.a == 1, t.b, ibis.NA), dt.string), + (lambda t: ibis.ifelse(t.a == 1, t.b, ibis.null()), dt.string), (lambda t: ibis.ifelse(t.a == 1, t.b, t.a.cast("string")), dt.string), ( lambda t: ibis.ifelse(t.a == 1, t.b, t.a.cast("!string")), dt.string.copy(nullable=False), ), - (lambda _: ibis.ifelse(True, ibis.NA, ibis.NA), dt.null), - (lambda _: ibis.ifelse(False, ibis.NA, ibis.NA), dt.null), + (lambda _: ibis.ifelse(True, ibis.null(), ibis.null()), dt.null), + (lambda _: ibis.ifelse(False, ibis.null(), ibis.null()), dt.null), ], ) def test_non_null_with_null_precedence(expr_fn, expected_type): diff --git a/ibis/tests/expr/test_window_frames.py b/ibis/tests/expr/test_window_frames.py index 2e88f2c2cac0..5560a3608501 100644 --- a/ibis/tests/expr/test_window_frames.py +++ b/ibis/tests/expr/test_window_frames.py @@ -234,7 +234,7 @@ def test_window_api_supports_value_expressions(t): def test_window_api_supports_scalar_order_by(t): - window = ibis.window(order_by=ibis.NA) + window = 
ibis.window(order_by=ibis.null()) expr = t.a.sum().over(window).op() expected = ops.WindowFunction( t.a.sum(), @@ -242,7 +242,7 @@ def test_window_api_supports_scalar_order_by(t): start=None, end=None, group_by=(), - order_by=(ibis.NA.op(),), + order_by=(ibis.null().op(),), ) assert expr == expected From cf34b9308d52be477ef4e54b636cbab614b823c2 Mon Sep 17 00:00:00 2001 From: ncclementi Date: Mon, 10 Jun 2024 13:11:55 -0400 Subject: [PATCH 2/5] test: add test for ibis NA deprecation --- ibis/tests/test_api.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ibis/tests/test_api.py b/ibis/tests/test_api.py index fc672a4af8ed..b3c68a0ce3de 100644 --- a/ibis/tests/test_api.py +++ b/ibis/tests/test_api.py @@ -69,3 +69,8 @@ def test_no_import(module): assert "{module}" not in sys.modules """ subprocess.run([sys.executable, "-c", script], check=True) + + +def test_ibis_na_deprecation_warning(): + with pytest.warns(DeprecationWarning, match="'ibis.NA' is deprecated as of v9.1"): + ibis.NA # noqa: B018 From f665950a10312c9e47be53228c17fa5a5a322875 Mon Sep 17 00:00:00 2001 From: ncclementi Date: Mon, 10 Jun 2024 13:15:40 -0400 Subject: [PATCH 3/5] chore: remove unnecessary import --- ibis/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ibis/__init__.py b/ibis/__init__.py index 6bc98966502b..1ef64404a9e4 100644 --- a/ibis/__init__.py +++ b/ibis/__init__.py @@ -139,8 +139,7 @@ def __getattr__(name: str) -> Any: DeprecationWarning, stacklevel=2, ) - import ibis - return ibis.null() + return null() # noqa: F405 else: return load_backend(name) From dfdaf59c78a7bc62e8c31d46d6b8ffe4b97613a0 Mon Sep 17 00:00:00 2001 From: ncclementi Date: Mon, 10 Jun 2024 13:30:42 -0400 Subject: [PATCH 4/5] chore: fix typo in mtch msg in test --- ibis/tests/test_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ibis/tests/test_api.py b/ibis/tests/test_api.py index b3c68a0ce3de..0be84a07e522 100644 --- a/ibis/tests/test_api.py +++ 
b/ibis/tests/test_api.py @@ -72,5 +72,7 @@ def test_no_import(module): def test_ibis_na_deprecation_warning(): - with pytest.warns(DeprecationWarning, match="'ibis.NA' is deprecated as of v9.1"): + with pytest.warns( + DeprecationWarning, match="The 'ibis.NA' constant is deprecated as of v9.1" + ): ibis.NA # noqa: B018 From 5aa35722506ab9235ad74b42a6a3625a4139f1b7 Mon Sep 17 00:00:00 2001 From: Naty Clementi Date: Mon, 10 Jun 2024 15:43:27 -0400 Subject: [PATCH 5/5] chore: update test Co-authored-by: Jim Crist-Harif --- ibis/tests/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ibis/tests/test_api.py b/ibis/tests/test_api.py index 0be84a07e522..1a7f2933bc63 100644 --- a/ibis/tests/test_api.py +++ b/ibis/tests/test_api.py @@ -75,4 +75,4 @@ def test_ibis_na_deprecation_warning(): with pytest.warns( DeprecationWarning, match="The 'ibis.NA' constant is deprecated as of v9.1" ): - ibis.NA # noqa: B018 + assert ibis.NA is ibis.null()