Skip to content

Commit

Permalink
chore: update semantic join column reference syntax (#1102)
Browse files Browse the repository at this point in the history
* chore: update semantic join column reference syntax

* fix format

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* update wording

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
  • Loading branch information
sycai and gcf-owl-bot[bot] authored Oct 23, 2024
1 parent 7094c85 commit 6b3ceaa
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 58 deletions.
27 changes: 17 additions & 10 deletions bigframes/operations/semantics.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,10 +462,10 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
An instruction on how left and right rows can be joined. This value must contain
column references by name, which should be wrapped in a pair of braces.
For example: "The {city} belongs to the {country}".
For column names that are shared between two dataframes, you need to add "_left"
and "_right" suffix for differentiation. This is especially important when you do
self joins. For example: "The {employee_name_left} reports to {employee_name_right}"
You must not add "_left" or "_right" suffix to non-overlapping columns.
For column names that are shared between two dataframes, you need to add the "left."
and "right." prefixes for differentiation. This is especially important when you do
self joins. For example: "The {left.employee_name} reports to {right.employee_name}"
For unique column names, this prefix is optional.
model:
A GeminiTextGenerator provided by Bigframes ML package.
Expand Down Expand Up @@ -503,27 +503,29 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
elif col in other.columns:
right_columns.append(col)

elif col.endswith("_left"):
original_col_name = col[: -len("_left")]
elif col.startswith("left."):
original_col_name = col[len("left.") :]
if (
original_col_name in self._df.columns
and original_col_name in other.columns
):
left_columns.append(col)
elif original_col_name in self._df.columns:
raise ValueError(f"Unnecessary suffix for {col}")
left_columns.append(col)
instruction = instruction.replace(col, original_col_name)
else:
raise ValueError(f"Column {col} not found")

elif col.endswith("_right"):
original_col_name = col[: -len("_right")]
elif col.startswith("right."):
original_col_name = col[len("right.") :]
if (
original_col_name in self._df.columns
and original_col_name in other.columns
):
right_columns.append(col)
elif original_col_name in other.columns:
raise ValueError(f"Unnecessary suffix for {col}")
right_columns.append(col)
instruction = instruction.replace(col, original_col_name)
else:
raise ValueError(f"Column {col} not found")

Expand All @@ -536,6 +538,11 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
if not right_columns:
raise ValueError("No right column references.")

# Update column references to be compatible with internal naming scheme.
# That is, "left.col" -> "col_left" and "right.col" -> "col_right"
instruction = re.sub(r"(?<!{){left\.(\w+)}(?!})", r"{\1_left}", instruction)
instruction = re.sub(r"(?<!{){right\.(\w+)}(?!})", r"{\1_right}", instruction)

joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right"))

return joined_df.semantics.filter(instruction, model).reset_index(drop=True)
Expand Down
55 changes: 23 additions & 32 deletions notebooks/experimental/semantic_operators.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@
{
"data": {
"text/html": [
"Query job cdc57da6-3849-4e40-b2a4-0d678872c8a6 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:cdc57da6-3849-4e40-b2a4-0d678872c8a6&page=queryresults\">Open Job</a>"
"Query job aadc79c5-5402-4922-a694-adb3848e3193 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:aadc79c5-5402-4922-a694-adb3848e3193&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -138,7 +138,7 @@
{
"data": {
"text/html": [
"Query job df036c3e-4557-44a3-a3fd-c592a7f16ee3 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:df036c3e-4557-44a3-a3fd-c592a7f16ee3&page=queryresults\">Open Job</a>"
"Query job 16757654-a541-47ed-ac48-84b4b549f3bd is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:16757654-a541-47ed-ac48-84b4b549f3bd&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand Down Expand Up @@ -190,7 +190,7 @@
{
"data": {
"text/html": [
"Query job 263b35a1-4f93-4d20-bc49-7c3736c6629d is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:263b35a1-4f93-4d20-bc49-7c3736c6629d&page=queryresults\">Open Job</a>"
"Query job 2640c2d3-ceb0-4f8a-8bb2-a3ec5b3c8eb8 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:2640c2d3-ceb0-4f8a-8bb2-a3ec5b3c8eb8&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand Down Expand Up @@ -284,7 +284,7 @@
{
"data": {
"text/html": [
"Query job 98f877ed-6a40-4b6c-84cf-43f3ea4d29c9 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:98f877ed-6a40-4b6c-84cf-43f3ea4d29c9&page=queryresults\">Open Job</a>"
"Query job 0aeff7d2-c2f3-45f1-8f8f-5be572392822 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:0aeff7d2-c2f3-45f1-8f8f-5be572392822&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -304,7 +304,7 @@
{
"data": {
"text/html": [
"Query job b0a9ac43-9be2-4b37-9bea-267406d1bd65 is DONE. 6 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:b0a9ac43-9be2-4b37-9bea-267406d1bd65&page=queryresults\">Open Job</a>"
"Query job 0f3bfbe3-30c5-4bf9-8cf2-a3ea694f878d is DONE. 6 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:0f3bfbe3-30c5-4bf9-8cf2-a3ea694f878d&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -316,7 +316,7 @@
{
"data": {
"text/html": [
"Query job af163e99-1771-4d1f-841a-9aeb45936fd6 is DONE. 50 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:af163e99-1771-4d1f-841a-9aeb45936fd6&page=queryresults\">Open Job</a>"
"Query job dfef2ffb-7196-4c94-bcae-d050021ebb5f is DONE. 50 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:dfef2ffb-7196-4c94-bcae-d050021ebb5f&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -328,7 +328,7 @@
{
"data": {
"text/html": [
"Query job bef74461-ba0e-4a4d-8158-7cdb1ddf993c is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:bef74461-ba0e-4a4d-8158-7cdb1ddf993c&page=queryresults\">Open Job</a>"
"Query job f6397298-dfe2-4e9d-ab91-e790c04ccddc is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:f6397298-dfe2-4e9d-ab91-e790c04ccddc&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand Down Expand Up @@ -418,7 +418,7 @@
{
"data": {
"text/html": [
"Query job 18101b3e-f11a-44fd-971a-0c9c35df03ad is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:18101b3e-f11a-44fd-971a-0c9c35df03ad&page=queryresults\">Open Job</a>"
"Query job 2d6b0f90-20b3-419e-8d38-68be9e7ee1ae is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:2d6b0f90-20b3-419e-8d38-68be9e7ee1ae&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand Down Expand Up @@ -510,7 +510,7 @@
{
"data": {
"text/html": [
"Query job fee13cd3-657c-4d89-8c0a-0ba006ad1751 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:fee13cd3-657c-4d89-8c0a-0ba006ad1751&page=queryresults\">Open Job</a>"
"Query job a90d785f-29d7-4818-b595-9326657cc865 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:a90d785f-29d7-4818-b595-9326657cc865&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -530,7 +530,7 @@
{
"data": {
"text/html": [
"Query job eb38238a-868e-4769-81fd-6614e9ef46d1 is DONE. 6 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:eb38238a-868e-4769-81fd-6614e9ef46d1&page=queryresults\">Open Job</a>"
"Query job de201200-4135-487b-8582-12d5137ddc24 is DONE. 6 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:de201200-4135-487b-8582-12d5137ddc24&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -542,7 +542,7 @@
{
"data": {
"text/html": [
"Query job 7adaa9c1-13c5-4600-8be9-f204ed9764f1 is DONE. 52 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:7adaa9c1-13c5-4600-8be9-f204ed9764f1&page=queryresults\">Open Job</a>"
"Query job ec36bc3a-773b-4443-be63-72e4c1063168 is DONE. 52 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:ec36bc3a-773b-4443-be63-72e4c1063168&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -554,7 +554,7 @@
{
"data": {
"text/html": [
"Query job c2342d4d-cfc6-4f68-9c83-b56ae01c7d4d is DONE. 133 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:c2342d4d-cfc6-4f68-9c83-b56ae01c7d4d&page=queryresults\">Open Job</a>"
"Query job ec204378-936a-4dc9-8e4a-5d4b17caad78 is DONE. 133 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:ec204378-936a-4dc9-8e4a-5d4b17caad78&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand Down Expand Up @@ -680,7 +680,7 @@
{
"data": {
"text/html": [
"Query job 39494a96-c1a2-4708-94e8-c87d4818dbc6 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:39494a96-c1a2-4708-94e8-c87d4818dbc6&page=queryresults\">Open Job</a>"
"Query job d1195b35-ca65-474c-9847-8e315447a941 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:d1195b35-ca65-474c-9847-8e315447a941&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -700,7 +700,7 @@
{
"data": {
"text/html": [
"Query job f55e46ba-4453-4fb3-9a9a-da6b769ed3a6 is DONE. 30 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:f55e46ba-4453-4fb3-9a9a-da6b769ed3a6&page=queryresults\">Open Job</a>"
"Query job 721d1ef5-810e-4d0f-bb4a-250dc12bbe82 is DONE. 30 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:721d1ef5-810e-4d0f-bb4a-250dc12bbe82&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -712,7 +712,7 @@
{
"data": {
"text/html": [
"Query job 090121f4-0654-43e2-a62b-0d2e77659950 is DONE. 251 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:090121f4-0654-43e2-a62b-0d2e77659950&page=queryresults\">Open Job</a>"
"Query job 54502b90-fea2-4545-8542-a6be6f6837ac is DONE. 251 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:54502b90-fea2-4545-8542-a6be6f6837ac&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -724,7 +724,7 @@
{
"data": {
"text/html": [
"Query job 75e94a17-45bc-417f-8647-f3e74a52ada5 is DONE. 144 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:75e94a17-45bc-417f-8647-f3e74a52ada5&page=queryresults\">Open Job</a>"
"Query job 00326a4e-2371-4572-a842-e14ba99ac02c is DONE. 144 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:00326a4e-2371-4572-a842-e14ba99ac02c&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand Down Expand Up @@ -821,7 +821,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We use a self-join example to demonstrate a special case: what happens when the joining columns exist in both data frames? It turns out that you need to provide extra information in your column references: by attaching \"_left\" and \"_right\" suffixes to your column names. \n",
"We use a self-join example to demonstrate a special case: what happens when the joining columns exist in both data frames? In that case, you need to provide extra information in your column references by attaching \"left.\" and \"right.\" prefixes to your column names. \n",
"\n",
"Let's create an example data frame:"
]
Expand All @@ -839,7 +839,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We want to compare the weights of these animals, and output all the pairs where the animal on the left is heavier than the animal on the right. In this case, we use `animal_left` and `animal_right` to differentiate the data sources:"
"We want to compare the weights of these animals, and output all the pairs where the animal on the left is heavier than the animal on the right. In this case, we use `left.animal` and `right.animal` to differentiate the data sources:"
]
},
{
Expand All @@ -850,7 +850,7 @@
{
"data": {
"text/html": [
"Query job 9dcd66e6-2354-4ee1-8d77-b94a66facc31 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:9dcd66e6-2354-4ee1-8d77-b94a66facc31&page=queryresults\">Open Job</a>"
"Query job 2a87f5a4-927d-472f-808d-bc86e008dbaf is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:2a87f5a4-927d-472f-808d-bc86e008dbaf&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -870,7 +870,7 @@
{
"data": {
"text/html": [
"Query job 01a2cff2-b4ba-4aa2-8126-d4754bccda12 is DONE. 32 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:01a2cff2-b4ba-4aa2-8126-d4754bccda12&page=queryresults\">Open Job</a>"
"Query job 591da923-319f-43f9-bf24-20e34abf899f is DONE. 32 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:591da923-319f-43f9-bf24-20e34abf899f&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -882,7 +882,7 @@
{
"data": {
"text/html": [
"Query job 35b2999b-207b-40b0-9cc1-425a9a68707a is DONE. 266 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:35b2999b-207b-40b0-9cc1-425a9a68707a&page=queryresults\">Open Job</a>"
"Query job 1422bfcf-23a9-4484-96d7-16d5b56dfe26 is DONE. 266 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1422bfcf-23a9-4484-96d7-16d5b56dfe26&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -894,7 +894,7 @@
{
"data": {
"text/html": [
"Query job cb839bc6-0f13-4b66-a462-a440a9a162c8 is DONE. 180 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:cb839bc6-0f13-4b66-a462-a440a9a162c8&page=queryresults\">Open Job</a>"
"Query job a659dcaf-9bd4-4776-bb82-c0d3408b41a2 is DONE. 180 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:a659dcaf-9bd4-4776-bb82-c0d3408b41a2&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand Down Expand Up @@ -982,16 +982,7 @@
}
],
"source": [
"animals.semantics.join(animals, \"{animal_left} generally weighs heavier than {animal_right}\", model=gemini_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that you should not attach \"_left\" or \"_right\" suffixes to non-overlapping columns, otherwise you will get an error.\n",
"\n",
"There is an additional concern on this naming scheme: it risks collisions with existing column names that end with \"_left\" or \"_right\", so we need more efforts to find a better solution."
"animals.semantics.join(animals, \"{left.animal} generally weighs heavier than {right.animal}\", model=gemini_model)"
]
},
{
Expand Down
Loading

0 comments on commit 6b3ceaa

Please sign in to comment.