kraina-ai · Repcak2000 · Mar 15, 2024 · Apr 17, 2024 · Apr 17, 2024 · Apr 17, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
       - id: conventional-pre-commit
         stages: [commit-msg]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.3.7'
+    rev: 'v0.4.1'
     hooks:
       - id: ruff
         types_or: [ python, pyi, jupyter ]
@@ -28,7 +28,7 @@ repos:
         args: ["--config-file", "pyproject.toml"]
         additional_dependencies: ['types-requests', 'types-six']
   - repo: https://github.com/pdm-project/pdm
-    rev: 2.14.0
+    rev: 2.15.0
     hooks:
       - id: pdm-lock-check
       - id: pdm-export

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+### Added
+
+- Initial implementation of datasets for feature enrichment and benchmarking [#430](https://github.com/kraina-ai/srai/pull/430)
+
 ## [0.7.3] - 2024-04-21
 
 ### Changed

diff --git a/examples/base_models/regression_model.ipynb b/examples/base_models/regression_model.ipynb
@@ -0,0 +1,306 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import geopandas as gpd\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import torch.optim as optim\n",
+    "from shapely.geometry import Polygon\n",
+    "\n",
+    "from srai.datasets import AirbnbMulticityDataset\n",
+    "from srai.h3 import h3_to_geoseries\n",
+    "from srai.models import Evaluator, Predictor, RegressionBaseModel, Trainer, Vectorizer\n",
+    "from srai.plotting import plot_numeric_data\n",
+    "from srai.regionalizers import H3Regionalizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "airbnb = AirbnbMulticityDataset()\n",
+    "gdf_airbnb = airbnb.load(os.getenv(\"HF_TOKEN\"))\n",
+    "gdf_airbnb = gdf_airbnb.loc[gdf_airbnb[\"city\"].isin([\"paris\"])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "resolution = 8"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_gdf, test_gdf = airbnb.train_test_split_bucket_regression(gdf_airbnb)\n",
+    "train_gdf, dev_gdf = airbnb.train_test_split_bucket_regression(train_gdf)  # get dev set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_gdf, test_gdf = airbnb.train_test_split_spatial_points(gdf_airbnb)\n",
+    "train_gdf, dev_gdf = airbnb.train_test_split_spatial_points(train_gdf)  # get dev set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorizer = Vectorizer(\n",
+    "    gdf_train=train_gdf,\n",
+    "    HF_dataset_object=airbnb,\n",
+    "    embedder_type=\"Hex2VecEmbedder\",\n",
+    "    h3_resolution=resolution,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_airbnb_train = vectorizer.get_dataset(train_gdf)\n",
+    "embedding_size = dataset_airbnb_train[\"X\"].shape[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "(train_gdf.shape[0] + test_gdf.shape[0] + dev_gdf.shape[0]) == gdf_airbnb.shape[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_airbnb_test = vectorizer.get_dataset(test_gdf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_airbnb_dev = vectorizer.get_dataset(dev_gdf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_airbnb_dev"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_airbnb_test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_airbnb_train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(dataset_airbnb_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "regression_model = RegressionBaseModel(embedding_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loss_fn = nn.L1Loss()\n",
+    "optimizer = optim.Adam(regression_model.parameters(), lr=0.001)\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "args = {\n",
+    "    \"batch_size\": 32,\n",
+    "    \"task\": \"regression\",\n",
+    "    \"epochs\": 50,\n",
+    "    \"device\": device,\n",
+    "    \"metric2look4\": \"MAE\",\n",
+    "}\n",
+    "trainer = Trainer(\n",
+    "    model=regression_model,\n",
+    "    train_dataset=dataset_airbnb_train,\n",
+    "    eval_dataset=dataset_airbnb_dev,\n",
+    "    optimizer=optimizer,\n",
+    "    loss_fn=loss_fn,\n",
+    "    **args,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model, _, _ = trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "evaluator = Evaluator(task=\"regression\", device=device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "evaluator.evaluate(model, dataset_airbnb_test, return_metrics=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictor = Predictor(\"regression\", device=device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_, hexes, values = predictor.predict(model, dataset_airbnb_test, resolution=resolution)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "original_label = [dataset_airbnb_test[i][\"y\"] for i in range(len(dataset_airbnb_test))]\n",
+    "original_hexes = [dataset_airbnb_test[i][\"X_h3_idx\"] for i in range(len(dataset_airbnb_test))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "polygons = h3_to_geoseries(\n",
+    "    hexes,\n",
+    ")\n",
+    "preds_gdf = gpd.GeoDataFrame(geometry=polygons)\n",
+    "preds_gdf.crs = \"EPSG:4326\"  # authority string (init-dict form is deprecated)\n",
+    "preds_gdf[\"price\"] = [tensor.item() for tensor in values]\n",
+    "preds_gdf[\"region_id\"] = hexes\n",
+    "preds_gdf.index = preds_gdf[\"region_id\"]\n",
+    "\n",
+    "original_polygons = h3_to_geoseries(original_hexes)\n",
+    "original_gdf = gpd.GeoDataFrame(geometry=[Polygon(polygon) for polygon in original_polygons])\n",
+    "original_gdf.crs = \"EPSG:4326\"  # authority string (init-dict form is deprecated)\n",
+    "original_gdf[\"price\"] = [tensor.item() for tensor in original_label]\n",
+    "original_gdf[\"region_id\"] = original_hexes\n",
+    "original_gdf.index = original_gdf[\"region_id\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "regionalizer = H3Regionalizer(resolution=resolution)\n",
+    "regions = regionalizer.transform(original_gdf)\n",
+    "plot_numeric_data(regions, \"price\", original_gdf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_numeric_data(regions, \"price\", preds_gdf)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}