diff --git a/.devcontainer/updateContent.sh b/.devcontainer/updateContent.sh index fa78393b0f731..79af8c951f6b4 100755 --- a/.devcontainer/updateContent.sh +++ b/.devcontainer/updateContent.sh @@ -2,4 +2,4 @@ # install ibis python3 -m pip install ipython -python3 -m pip install -e '.[duckdb,examples]' +python3 -m pip install -e '.[duckdb,clickhouse,examples]' diff --git a/docs/how-to/analytics/imdb.qmd b/docs/how-to/analytics/imdb.qmd new file mode 100644 index 0000000000000..2cf32ae0f45ff --- /dev/null +++ b/docs/how-to/analytics/imdb.qmd @@ -0,0 +1,276 @@ +--- +title: Analyze IMDB data using Ibis +--- + +Let's use the Ibis examples module and the DuckDB backend to find some movies +to watch. + +Adapted from [Phillip in the Cloud's livestream using the same +data](https://www.youtube.com/watch?v=J7sEn9VklKY). + +## Imports + +For this example, we'll just use Ibis. + +```{python} +from ibis.interactive import * # <1> +``` + +1. This import imports `ibis.examples` as `ex`. + +## Fetch the example data + +We can use the `ibis.examples` module to fetch the IMDB data. Ibis +automatically caches the data on disk so subsequent runs don't require fetching +from cloud storage on each call to `fetch`. + +```{python} +name_basics = ex.imdb_name_basics.fetch() +name_basics +``` + +To ensure column names are Pythonic, we can rename as `snake_case`. + +```{python} +name_basics.rename("snake_case") +``` + +Let's grab all of the relevant IMDB tables and rename columns. 
+ +```{python} +name_basics = ex.imdb_name_basics.fetch().rename("snake_case") +title_akas = ex.imdb_title_akas.fetch().rename("snake_case") +title_basics = ex.imdb_title_basics.fetch().rename("snake_case") +title_crew = ex.imdb_title_crew.fetch().rename("snake_case") +title_episode = ex.imdb_title_episode.fetch().rename("snake_case") +title_principals = ex.imdb_title_principals.fetch().rename("snake_case") +title_ratings = ex.imdb_title_ratings.fetch().rename("snake_case") +``` + +## Preview the data + +We'll print out the first few rows of each table to get an idea of what is +contained in each. + +```{python} +name_basics.head() +``` + +```{python} +title_akas.head() +``` + +```{python} +title_basics.head() +``` + +```{python} +title_crew.head() +``` + +```{python} +title_episode.head() +``` + +```{python} +title_principals.head() +``` + +```{python} +title_ratings.head() +``` + +## Basic data exploration + +Let's check how many records are in each table. It's just Python, so we can +construct a dictionary and iterate through it in a for loop. + +```{python} +tables = { + "name_basics": name_basics, + "title_akas": title_akas, + "title_basics": title_basics, + "title_crew": title_crew, + "title_episode": title_episode, + "title_principals": title_principals, + "title_ratings": title_ratings, +} +max_name_len = max(map(len, tables.keys())) + 1 +``` + +```{python} +print("Length of tables:") +for t in tables: + print(f"\t{t.ljust(max_name_len)}: {tables[t].count().to_pandas():,}") +``` + +## Clean data + +Looking at the data, the `nconst` and `tconst` columns seem to be unique +identifiers. Let's confirm and adjust them accordingly. + +```{python} +name_basics.head() +``` + +Check the number of unique `nconst` values. + +```{python} +name_basics.nconst.nunique() +``` + +Confirm it's equal to the number of rows. + +```{python} +name_basics.nconst.nunique() == name_basics.count() +``` + +Mutate the table to convert `nconst` to an integer. 
+ +```{python} +t = name_basics.mutate(nconst=_.nconst.replace("nm", "").cast("int")) +t.head() +``` + +Let's also turn `primary_profession` into an array of strings instead of +a single comma-separated string. + +```{python} +t = t.mutate(primary_profession=_.primary_profession.split(",")) +t +``` + +And, combining the two concepts, convert `known_for_titles` into an array of +integers corresponding to `tconst` identifiers. + +```{python} +t = t.mutate( + known_for_titles=_.known_for_titles.split(",").map( + lambda tconst: tconst.replace("tt", "").cast("int") + ) +) +t +``` + +## DRY-ing up the code + +We can define functions to convert `nconst` and `tconst` to integers. + +```{python} +def nconst_to_int(nconst): + return nconst.replace("nm", "").cast("int") + + +def tconst_to_int(tconst): + return tconst.replace("tt", "").cast("int") +``` + +Then combine the previous data cleansing in a single mutate call. + +```{python} +name_basics = name_basics.mutate( + nconst=nconst_to_int(_.nconst), + primary_profession=_.primary_profession.split(","), + known_for_titles=_.known_for_titles.split(",").map(tconst_to_int), +) +name_basics +``` + +We can use `ibis.to_sql` to see the SQL this generates. + +```{python} +ibis.to_sql(name_basics) +``` + +Clean the rest of the tables. We'll convert `nconst` and `tconst` columns +consistently to allow for easy joining. 
+ +```{python} +title_akas = title_akas.mutate(title_id=tconst_to_int(_.title_id)).rename( + tconst="title_id" +) +title_basics = title_basics.mutate(tconst=tconst_to_int(_.tconst)) +title_crew = title_crew.mutate( + tconst=tconst_to_int(_.tconst), + directors=_.directors.split(",").map(nconst_to_int), + writers=_.writers.split(",").map(nconst_to_int), +) +title_episode = title_episode.mutate( + tconst=tconst_to_int(_.tconst), parent_tconst=tconst_to_int(_.parent_tconst) +) +title_principals = title_principals.mutate( + tconst=tconst_to_int(_.tconst), nconst=nconst_to_int(_.nconst) +) +title_ratings = title_ratings.mutate(tconst=tconst_to_int(_.tconst)) +``` + +## Finding good (and bad) movies to watch + +Join the IMDB rankings with information about the movies. + +```{python} +joined = title_basics.join(title_ratings, "tconst") +joined +``` + +```{python} +joined.title_type.value_counts().order_by(_.title_type_count.desc()) +``` + +Filter down to movies. + +```{python} +joined = joined.filter(_.title_type == "movie") +joined +``` + +Reorder the columns and drop some. + +```{python} +joined = joined.select( + "tconst", + "primary_title", + "average_rating", + "num_votes", + "genres", + "runtime_minutes", +) +joined +``` + +Sort by the average rating. + +```{python} +joined = joined.order_by([_.average_rating.desc(), _.num_votes.desc()]) +joined +``` + +A lot of 10/10 movies I haven't heard of … let's filter to movies with at least +`N` votes. + +```{python} +N = 50000 +joined = joined.filter(_.num_votes > N) +joined +``` + +What if you're in the mood for a bad movie? + +```{python} +joined = joined.order_by([_.average_rating.asc(), _.num_votes.desc()]) +joined +``` + +And specifically a bad comedy? + +```{python} +joined = joined.filter(_.genres.contains("Comedy")) +joined +``` + +Perfect! + +## Next Steps + +We only used two of the IMDB tables. What else can we do with the rest of the +data? Play around and let us know! 
diff --git a/docs/how-to/input-output/duckdb-parquet.qmd b/docs/how-to/input-output/duckdb-parquet.qmd new file mode 100644 index 0000000000000..51c0cd6171f30 --- /dev/null +++ b/docs/how-to/input-output/duckdb-parquet.qmd @@ -0,0 +1,96 @@ +--- +title: Read parquet files with Ibis +--- + +In this example, we will use Ibis's DuckDB backend to analyze data from +a remote parquet source using `ibis.read_parquet`. `ibis.read_parquet` can also +read local parquet files, and there are other `ibis.read_*` functions that +conveniently return a table expression from a file. One such function is +`ibis.read_csv`, which reads from local and remote CSV. + +We will be reading from the [**Global Biodiversity Information Facility (GBIF) +Species Occurrences**](https://registry.opendata.aws/gbif/) dataset. It is +hosted on S3 at `s3://gbif-open-data-us-east-1/occurrence/` + +## Reading One Partition + +We can read a single partition by specifying its path. + +We do this by calling +[`read_parquet`](https://ibis-project.org/api/expressions/top_level/#ibis.read_parquet) +on the partition we care about. + +So to read the first partition in this dataset, we'll call `read_parquet` on +`000000` in that path: + +```{python} +import ibis + +t = ibis.read_parquet( + "s3://gbif-open-data-us-east-1/occurrence/2023-04-01/occurrence.parquet/000000" +) +t +``` + +Note that we're calling `read_parquet` and receiving a table expression without +establishing a connection first. Ibis spins up a DuckDB connection (or +whichever default backend you have) when you call `ibis.read_parquet` (or even +`ibis.read_csv`). + +Since our result, `t`, is a table expression, we can now run queries against +the file using Ibis expressions.
For example, we can select columns, filter the +file, and then view the first five rows of the result: + +```{python} +cols = [ + "gbifid", + "datasetkey", + "occurrenceid", + "kingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "day", + "month", + "year", +] + +t.select(cols).filter(t["family"].isin(["Corvidae"])).limit(5).to_pandas() +``` + +We can count the rows in the table (partition): + +```{python} +t.count().to_pandas() +``` + +## Reading all partitions: filter, aggregate, export + +We can use `read_parquet` to read an entire parquet file by globbing all +partitions: + +```{python} +t = ibis.read_parquet( + "s3://gbif-open-data-us-east-1/occurrence/2023-04-01/occurrence.parquet/*" +) +``` + +Since the function returns a table expression, we can perform valid selections, +filters, aggregations, and exports just as we could with any other table +expression: + +```{python} +df = ( + t.select(["gbifid", "family", "species"]) + .filter(t["family"].isin(["Corvidae"])) + # Here we limit by 10,000 to fetch a quick batch of results + .limit(10000) + .group_by("species") + .count() + .to_pandas() +) +df +``` diff --git a/docs/tutorials/data-platforms/clickhouse.qmd b/docs/tutorials/data-platforms/clickhouse.qmd new file mode 100644 index 0000000000000..1f27921ac84e1 --- /dev/null +++ b/docs/tutorials/data-platforms/clickhouse.qmd @@ -0,0 +1,142 @@ +--- +title: ClickHouse +--- + +[Ibis](https://ibis-project.org) supports reading and querying data using +[ClickHouse](https://clickhouse.com/) as a backend. + +In this example we'll demonstrate using Ibis to connect to a ClickHouse server, +and executing a few queries. + +```{python} +from ibis.interactive import * +``` + +## Creating a Connection + +First we need to connect Ibis to a running ClickHouse server. + +In this example we'll run queries against the publicly available [ClickHouse +playground](https://clickhouse.com/docs/en/getting-started/playground) server.
+ +To run against your own ClickHouse server you'd only need to change the +connection details. + +```{python} +con = ibis.connect("clickhouse://play@play.clickhouse.com:443") +``` + +## Listing available tables + +The ClickHouse playground server has a number of interesting datasets +available. To see them, we can examine the tables via the `.tables` attribute. + +This shows a list of all tables available: + +```{python} +con.tables +``` + +## Inspecting a Table + +Let's take a look at the `hackernews` table. This table contains all posts and +comments on [Hacker News](https://news.ycombinator.com/). + +We can access the table by attribute as `con.tables.hackernews`. + +```{python} +t = con.tables.hackernews +``` + +We can then take a peek at the first few rows using the `.head()` method. + +```{python} +t.head() +``` + +## Finding the highest scoring posts + +Here we find the top 5 posts by score. + +Posts have a title, so we: + +- `filter` out rows that lack a title +- `select` only the columns we're interested in +- `order_by` score, descending +- `limit` to the top 5 rows + +```{python} +top_posts_by_score = ( + t.filter(_.title != "") + .select("title", "score") + .order_by(ibis.desc("score")) + .limit(5) +) + +top_posts_by_score +``` + +## Finding the most prolific commenters + +Here we find the top 5 commenters by number of comments made.
+ +To do this we: + +- `filter` out rows with no author +- `group_by` author +- `count` all the rows in each group +- `order_by` the counts, descending +- `limit` to the top 5 rows + +```{python} +top_commenters = ( + t.filter(_.by != "") + .group_by("by") + .agg(count=_.count()) + .order_by(ibis.desc("count")) + .limit(5) +) + +top_commenters +``` + +This query could also be expressed using the `.topk` method, which is +a shorthand for the above: + +```{python} +# This is a shorthand for the above +top_commenters = t.filter(_.by != "").by.topk(5) + +top_commenters +``` + +## Finding top commenters by score + +Here we find the top 5 commenters with the highest cumulative scores. In this +case the `.topk` shorthand won't work and we'll need to write out the full +`group_by` -> `agg` -> `order_by` -> `limit` pipeline. + +```{python} +top_commenters_by_score = ( + t.filter(_.by != "") + .group_by("by") + .agg(total_score=_.score.sum()) + .order_by(ibis.desc("total_score")) + .limit(5) +) + +top_commenters_by_score +``` + +## Next Steps + +There are lots of other interesting queries one might ask of this dataset. + +A few examples: + +- What posts had the most comments? +- How do post scores fluctuate over time? +- What day of the week has the highest average post score? What day has the lowest? + +To learn more about how to use Ibis with ClickHouse, see [the +documentation](https://ibis-project.org/backends/ClickHouse/).