From 6156f910facb5b53ca9f908acdeae306cf738104 Mon Sep 17 00:00:00 2001
From: Frost Ming <me@frostming.com>
Date: Thu, 21 Dec 2023 09:18:26 +0800
Subject: [PATCH] feat: reorganize the new SDK package (#4337)

* feat: reorganize the new SDK package

Signed-off-by: Frost Ming <me@frostming.com>

* feat: export types

Signed-off-by: Frost Ming <me@frostming.com>

* fix: sort imports

Signed-off-by: Frost Ming <me@frostming.com>

* fix: use real types

Signed-off-by: Frost Ming <me@frostming.com>

* fixup

Signed-off-by: Frost Ming <me@frostming.com>

* feat: expose clients

Signed-off-by: Frost Ming <me@frostming.com>
---
 examples/quickstart/model_pipeline.py         |   7 +-
 examples/quickstart/service.py                |   3 +-
 examples/sentence-embedding/service.py        |  10 +-
 pyproject.toml                                |   6 +-
 .../worker => _bentoml_impl}/__init__.py      |   0
 src/{bentoml_io => _bentoml_impl}/arrow.py    |   2 +-
 .../client/__init__.py                        |   0
 .../client/base.py                            |   0
 .../client/http.py                            |  99 ++++----
 .../client/proxy.py                           |   4 +-
 src/{bentoml_io => _bentoml_impl}/serde.py    |   4 +-
 .../server/.gitignore                         |   0
 .../server/__init__.py                        |   8 +-
 .../server/allocator.py                       |   3 +-
 .../server/app.py                             | 163 +++++++------
 .../server/main-openapi.html                  |   0
 .../server/main-ui.html                       |   0
 .../server/serving.py                         |   7 +-
 src/_bentoml_impl/worker/__init__.py          |   0
 .../worker/service.py                         |   6 +-
 src/_bentoml_sdk/__init__.py                  |  34 +++
 src/{bentoml_io => _bentoml_sdk}/_pydantic.py |  90 ++++++--
 src/{bentoml_io => _bentoml_sdk}/api.py       |   5 +-
 src/{bentoml_io => _bentoml_sdk}/io_models.py |  51 ++--
 .../types.py => _bentoml_sdk/schema.py}       | 217 +++++-------------
 src/_bentoml_sdk/service/__init__.py          |   7 +
 .../service}/config.py                        |   0
 .../service}/dependency.py                    |   6 +-
 .../service}/factory.py                       |   9 +-
 .../service}/openapi.py                       |   0
 .../typing_utils.py                           |   0
 src/bentoml/__init__.py                       |  48 ++++
 src/bentoml/_internal/bento/bento.py          |  20 +-
 src/bentoml/_internal/cloud/schemas.py        |   2 +-
 src/bentoml/_internal/context.py              |  23 +-
 src/bentoml/_internal/service/loader.py       |   4 +-
 src/bentoml/_internal/service/service.py      |   2 +-
 .../_internal/utils/analytics/usage_stats.py  |   2 +-
 src/bentoml/bentos.py                         |   2 +-
 src/bentoml/server.py                         |  21 +-
 src/bentoml/types.py                          |   2 +
 src/bentoml_cli/serve.py                      |   2 +-
 src/bentoml_cli/start.py                      |   2 +-
 src/bentoml_io/__init__.py                    |  22 --
 tests/unit/bentoml_cli/test_env_manager.py    |   1 +
 tests/unit/bentoml_io/test_allocator.py       |  25 +-
 webui/scripts/copy-lib.cjs                    |   2 +-
 webui/src/components/code/Python.tsx          |  12 +-
 48 files changed, 500 insertions(+), 433 deletions(-)
 rename src/{bentoml_io/worker => _bentoml_impl}/__init__.py (100%)
 rename src/{bentoml_io => _bentoml_impl}/arrow.py (98%)
 rename src/{bentoml_io => _bentoml_impl}/client/__init__.py (100%)
 rename src/{bentoml_io => _bentoml_impl}/client/base.py (100%)
 rename src/{bentoml_io => _bentoml_impl}/client/http.py (86%)
 rename src/{bentoml_io => _bentoml_impl}/client/proxy.py (97%)
 rename src/{bentoml_io => _bentoml_impl}/serde.py (97%)
 rename src/{bentoml_io => _bentoml_impl}/server/.gitignore (100%)
 rename src/{bentoml_io => _bentoml_impl}/server/__init__.py (51%)
 rename src/{bentoml_io => _bentoml_impl}/server/allocator.py (99%)
 rename src/{bentoml_io => _bentoml_impl}/server/app.py (72%)
 rename src/{bentoml_io => _bentoml_impl}/server/main-openapi.html (100%)
 rename src/{bentoml_io => _bentoml_impl}/server/main-ui.html (100%)
 rename src/{bentoml_io => _bentoml_impl}/server/serving.py (98%)
 create mode 100644 src/_bentoml_impl/worker/__init__.py
 rename src/{bentoml_io => _bentoml_impl}/worker/service.py (97%)
 create mode 100644 src/_bentoml_sdk/__init__.py
 rename src/{bentoml_io => _bentoml_sdk}/_pydantic.py (58%)
 rename src/{bentoml_io => _bentoml_sdk}/api.py (98%)
 rename src/{bentoml_io => _bentoml_sdk}/io_models.py (85%)
 rename src/{bentoml_io/types.py => _bentoml_sdk/schema.py} (61%)
 create mode 100644 src/_bentoml_sdk/service/__init__.py
 rename src/{bentoml_io => _bentoml_sdk/service}/config.py (100%)
 rename src/{bentoml_io => _bentoml_sdk/service}/dependency.py (90%)
 rename src/{bentoml_io => _bentoml_sdk/service}/factory.py (98%)
 rename src/{bentoml_io => _bentoml_sdk/service}/openapi.py (100%)
 rename src/{bentoml_io => _bentoml_sdk}/typing_utils.py (100%)
 delete mode 100644 src/bentoml_io/__init__.py

diff --git a/examples/quickstart/model_pipeline.py b/examples/quickstart/model_pipeline.py
index 1900470e5f4..1584d9efc2e 100644
--- a/examples/quickstart/model_pipeline.py
+++ b/examples/quickstart/model_pipeline.py
@@ -1,8 +1,9 @@
-import bentoml_io as bentoml
 import joblib
 from sklearn import datasets
 from sklearn import svm
 
+import bentoml
+
 if __name__ == "__main__":
     # Load training data
     iris = datasets.load_iris()
@@ -13,8 +14,6 @@
     clf.fit(X, y)
 
     # Save model to BentoML local model store
-    with bentoml.models.create(
-        "iris_clf",
-    ) as bento_model:
+    with bentoml.models.create("iris_clf") as bento_model:
         joblib.dump(clf, bento_model.path_of("model.pkl"))
     print(f"Model saved: {bento_model}")
diff --git a/examples/quickstart/service.py b/examples/quickstart/service.py
index 2b1e953e049..25e83b9dc55 100644
--- a/examples/quickstart/service.py
+++ b/examples/quickstart/service.py
@@ -1,6 +1,7 @@
-import bentoml_io as bentoml
 import numpy as np
 
+import bentoml
+
 
 @bentoml.service(resources={"cpu": "200m", "memory": "512Mi"})
 class Preprocessing:
diff --git a/examples/sentence-embedding/service.py b/examples/sentence-embedding/service.py
index 4160e90686b..24ad0be0b45 100644
--- a/examples/sentence-embedding/service.py
+++ b/examples/sentence-embedding/service.py
@@ -2,18 +2,14 @@
 
 import os
 
-import bentoml_io
 import numpy as np
 import torch
-from pydantic import Field
 
 import bentoml
+from bentoml import Field
 
 
-@bentoml_io.service(
-    resources={"memory": "500MiB"},
-    traffic={"timeout": 1},
-)
+@bentoml.service(resources={"memory": "500MiB"}, traffic={"timeout": 1})
 class SentenceEmbedding:
     model_ref = bentoml.models.get("all-MiniLM-L6-v2")
 
@@ -43,7 +39,7 @@ def mean_pooling(model_output, attention_mask):
             input_mask_expanded.sum(1), min=1e-9
         )
 
-    @bentoml_io.api(batchable=True)
+    @bentoml.api(batchable=True)
     def encode(
         self,
         sentences: list[str] = Field(["hello world"], description="input sentences"),
diff --git a/pyproject.toml b/pyproject.toml
index 1a876a511a7..c7838019f57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -133,9 +133,9 @@ fallback_version = "0.0.0"
 [tool.hatch.metadata]
 allow-direct-references = true
 [tool.hatch.build.targets.sdist]
-only-include = ["src/bentoml", "src/bentoml_cli", "src/bentoml_io"]
+only-include = ["src/bentoml", "src/bentoml_cli", "src/_bentoml_sdk", "src/_bentoml_impl"]
 [tool.hatch.build.targets.wheel]
-packages = ["src/bentoml", "src/bentoml_cli", "src/bentoml_io"]
+packages = ["src/bentoml", "src/bentoml_cli", "src/_bentoml_sdk", "src/_bentoml_impl"]
 
 [[tool.pdm.source]]
 url = "https://download.pytorch.org/whl/cpu"
@@ -378,7 +378,7 @@ convention = "google"
 
 [tool.ruff.isort]
 force-single-line = true
-known-first-party = ["bentoml"]
+known-first-party = ["bentoml", "bentoml_cli", "_bentoml_sdk", "_bentoml_impl"]
 
 [tool.pyright]
 pythonVersion = "3.12"
diff --git a/src/bentoml_io/worker/__init__.py b/src/_bentoml_impl/__init__.py
similarity index 100%
rename from src/bentoml_io/worker/__init__.py
rename to src/_bentoml_impl/__init__.py
diff --git a/src/bentoml_io/arrow.py b/src/_bentoml_impl/arrow.py
similarity index 98%
rename from src/bentoml_io/arrow.py
rename to src/_bentoml_impl/arrow.py
index ded26ff4182..d64362e71f1 100644
--- a/src/bentoml_io/arrow.py
+++ b/src/_bentoml_impl/arrow.py
@@ -5,7 +5,7 @@
 import pyarrow as pa
 from pydantic import BaseModel
 
-from .types import arrow_serialization
+from _bentoml_sdk.schema import arrow_serialization
 
 SchemaDict: t.TypeAlias = t.Dict[str, t.Any]
 
diff --git a/src/bentoml_io/client/__init__.py b/src/_bentoml_impl/client/__init__.py
similarity index 100%
rename from src/bentoml_io/client/__init__.py
rename to src/_bentoml_impl/client/__init__.py
diff --git a/src/bentoml_io/client/base.py b/src/_bentoml_impl/client/base.py
similarity index 100%
rename from src/bentoml_io/client/base.py
rename to src/_bentoml_impl/client/base.py
diff --git a/src/bentoml_io/client/http.py b/src/_bentoml_impl/client/http.py
similarity index 86%
rename from src/bentoml_io/client/http.py
rename to src/_bentoml_impl/client/http.py
index 4cd54a368d6..666be48e78b 100644
--- a/src/bentoml_io/client/http.py
+++ b/src/_bentoml_impl/client/http.py
@@ -5,6 +5,8 @@
 import inspect
 import io
 import logging
+import pathlib
+import tempfile
 import typing as t
 from http import HTTPStatus
 from urllib.parse import urljoin
@@ -14,12 +16,12 @@
 import attr
 from pydantic import RootModel
 
+from _bentoml_sdk import IODescriptor
+from _bentoml_sdk.typing_utils import is_file_like
+from _bentoml_sdk.typing_utils import is_image_type
 from bentoml._internal.utils.uri import uri_to_path
 from bentoml.exceptions import BentoMLException
 
-from ..types import File
-from ..typing_utils import is_file_like
-from ..typing_utils import is_image_type
 from .base import AbstractClient
 
 if t.TYPE_CHECKING:
@@ -28,8 +30,7 @@
     from aiohttp import ClientSession
     from aiohttp import MultipartWriter
 
-    from ..factory import Service
-    from ..io_models import IODescriptor
+    from _bentoml_sdk import Service
 
     T = t.TypeVar("T", bound="HTTPClient")
 
@@ -37,6 +38,10 @@
 MAX_RETRIES = 3
 
 
+def _is_file(obj: t.Any) -> bool:
+    return isinstance(obj, pathlib.PurePath) or is_file_like(obj)
+
+
 @attr.define(slots=True)
 class ClientEndpoint:
     name: str
@@ -59,6 +64,14 @@ class HTTPClient(AbstractClient):
     token: str | None = None
     _client: ClientSession | None = attr.field(init=False, default=None)
     _loop: asyncio.AbstractEventLoop | None = attr.field(init=False, default=None)
+    _opened_files: list[io.BufferedReader] = attr.field(init=False, factory=list)
+    _temp_dir: tempfile.TemporaryDirectory[str] = attr.field(init=False)
+
+    @_temp_dir.default
+    def default_temp_dir(self) -> tempfile.TemporaryDirectory[str]:
+        return tempfile.TemporaryDirectory(
+            prefix="bentoml-client-", ignore_cleanup_errors=True
+        )
 
     def __init__(
         self,
@@ -201,17 +214,22 @@ async def _call(
             endpoint = self.endpoints[name]
         except KeyError:
             raise BentoMLException(f"Endpoint {name} not found") from None
-        data = await self._prepare_request(endpoint, args, kwargs)
-        resp = await self._request(endpoint.route, data, headers=headers)
-        if endpoint.stream_output:
-            return self._parse_stream_response(endpoint, resp)
-        elif (
-            endpoint.output.get("type") == "file"
-            and self.media_type == "application/json"
-        ):
-            return await self._parse_file_response(endpoint, resp)
-        else:
-            return await self._parse_response(endpoint, resp)
+        try:
+            data = await self._prepare_request(endpoint, args, kwargs)
+            resp = await self._request(endpoint.route, data, headers=headers)
+            if endpoint.stream_output:
+                return self._parse_stream_response(endpoint, resp)
+            elif (
+                endpoint.output.get("type") == "file"
+                and self.media_type == "application/json"
+            ):
+                return await self._parse_file_response(endpoint, resp)
+            else:
+                return await self._parse_response(endpoint, resp)
+        finally:
+            for f in self._opened_files:
+                f.close()
+            self._opened_files.clear()
 
     async def _request(
         self,
@@ -262,10 +280,10 @@ def is_file_field(k: str) -> bool:
             if isinstance(model, IODescriptor):
                 return k in model.multipart_fields
             return (
-                is_file_like(value := model[k])
+                _is_file(value := model[k])
                 or isinstance(value, t.Sequence)
                 and len(value) > 0
-                and is_file_like(value[0])
+                and _is_file(value[0])
             )
 
         if isinstance(model, dict):
@@ -287,6 +305,10 @@ def is_file_field(k: str) -> bool:
                                 getattr(v, "_fp", v.fp),
                                 headers={"Content-Type": f"image/{v.format.lower()}"},
                             )
+                        elif isinstance(v, pathlib.PurePath):
+                            file = open(v, "rb")
+                            part = mp.append(file)
+                            self._opened_files.append(file)
                         else:
                             part = mp.append(v)
                         part.set_content_disposition(
@@ -294,9 +316,6 @@ def is_file_field(k: str) -> bool:
                         )
                 return mp
         elif isinstance(model, dict):
-            for k, v in data.items():
-                if is_file_field(v) and not isinstance(v, File):
-                    data[k] = File(v)
             return self.serde.serialize(data)
         else:
             return self.serde.serialize_model(model)
@@ -309,10 +328,10 @@ async def _prepare_request(
     ) -> bytes | MultipartWriter:
         if endpoint.input_spec is not None:
             model = endpoint.input_spec.from_inputs(*args, **kwargs)
-            if not model.multipart_fields:
-                return self.serde.serialize_model(model)
-            else:
+            if model.multipart_fields:
                 return self._build_multipart(model)
+            else:
+                return self.serde.serialize_model(model)
 
         for name, value in zip(endpoint.input["properties"], args):
             if name in kwargs:
@@ -334,7 +353,7 @@ async def _prepare_request(
         multipart_fields = {
             k
             for k, v in kwargs.items()
-            if is_file_like(v) or isinstance(v, t.Sequence) and v and is_file_like(v[0])
+            if _is_file(v) or isinstance(v, t.Sequence) and v and _is_file(v[0])
         }
         if multipart_fields:
             return self._build_multipart(kwargs)
@@ -371,21 +390,21 @@ async def _parse_stream_response(
 
     async def _parse_file_response(
         self, endpoint: ClientEndpoint, resp: ClientResponse
-    ) -> File:
-        from ..types import Audio
-        from ..types import Image
-        from ..types import Video
-
-        content_type = resp.headers.get("Content-Type")
-        cls = File
-        if content_type:
-            if content_type.startswith("image/"):
-                cls = Image
-            elif content_type.startswith("audio/"):
-                cls = Audio
-            elif content_type.startswith("video/"):
-                cls = Video
-        return cls(io.BytesIO(await resp.read()), media_type=content_type)
+    ) -> pathlib.Path:
+        from multipart.multipart import parse_options_header
+
+        content_disposition = resp.headers.get("content-disposition")
+        filename: str | None = None
+        if content_disposition:
+            _, options = parse_options_header(content_disposition)
+            if b"filename" in options:
+                filename = str(options[b"filename"], "utf-8", errors="ignore")
+
+        with tempfile.NamedTemporaryFile(
+            "wb", suffix=filename, dir=self._temp_dir.name, delete=False
+        ) as f:
+            f.write(await resp.read())
+        return pathlib.Path(f.name)
 
     async def close(self) -> None:
         if self._client is not None and not self._client.closed:
diff --git a/src/bentoml_io/client/proxy.py b/src/_bentoml_impl/client/proxy.py
similarity index 97%
rename from src/bentoml_io/client/proxy.py
rename to src/_bentoml_impl/client/proxy.py
index fb29ecf7896..4094a54dfe4 100644
--- a/src/bentoml_io/client/proxy.py
+++ b/src/_bentoml_impl/client/proxy.py
@@ -6,10 +6,10 @@
 import logging
 import typing as t
 
+from _bentoml_sdk import Service
+from _bentoml_sdk.api import APIMethod
 from bentoml._internal.utils import async_gen_to_sync
 from bentoml.exceptions import BentoMLException
-from bentoml_io.api import APIMethod
-from bentoml_io.factory import Service
 
 from .http import ClientEndpoint
 from .http import HTTPClient
diff --git a/src/bentoml_io/serde.py b/src/_bentoml_impl/serde.py
similarity index 97%
rename from src/bentoml_io/serde.py
rename to src/_bentoml_impl/serde.py
index 7672047655d..ad761ac4e17 100644
--- a/src/bentoml_io/serde.py
+++ b/src/_bentoml_impl/serde.py
@@ -8,12 +8,12 @@
 
 from starlette.datastructures import UploadFile
 
-from .typing_utils import is_list_type
+from _bentoml_sdk.typing_utils import is_list_type
 
 if t.TYPE_CHECKING:
     from starlette.requests import Request
 
-    from .io_models import IODescriptor
+    from _bentoml_sdk import IODescriptor
 
 T = t.TypeVar("T", bound="IODescriptor")
 
diff --git a/src/bentoml_io/server/.gitignore b/src/_bentoml_impl/server/.gitignore
similarity index 100%
rename from src/bentoml_io/server/.gitignore
rename to src/_bentoml_impl/server/.gitignore
diff --git a/src/bentoml_io/server/__init__.py b/src/_bentoml_impl/server/__init__.py
similarity index 51%
rename from src/bentoml_io/server/__init__.py
rename to src/_bentoml_impl/server/__init__.py
index c3957f029f8..b4e68d862c8 100644
--- a/src/bentoml_io/server/__init__.py
+++ b/src/_bentoml_impl/server/__init__.py
@@ -1,8 +1,10 @@
 """
-    bentoml_io.server
-    ~~~~~~~~~~~~~~~~~
+    _bentoml_impl.server
+    ~~~~~~~~~~~~~~~~~~~~
 
     A reference implementation of serving a BentoML service.
     This will be eventually migrated to Rust.
 """
-from ..factory import Service as Service
+from .serving import serve_http
+
+__all__ = ["serve_http"]
diff --git a/src/bentoml_io/server/allocator.py b/src/_bentoml_impl/server/allocator.py
similarity index 99%
rename from src/bentoml_io/server/allocator.py
rename to src/_bentoml_impl/server/allocator.py
index 3b05a6366f0..c888bdbfd9c 100644
--- a/src/bentoml_io/server/allocator.py
+++ b/src/_bentoml_impl/server/allocator.py
@@ -5,12 +5,11 @@
 from simple_di import Provide
 from simple_di import inject
 
+from _bentoml_sdk import Service
 from bentoml._internal.configuration.containers import BentoMLContainer
 from bentoml._internal.resource import system_resources
 from bentoml.exceptions import BentoMLConfigException
 
-from ..factory import Service
-
 
 class ResourceUnavailable(BentoMLConfigException):
     pass
diff --git a/src/bentoml_io/server/app.py b/src/_bentoml_impl/server/app.py
similarity index 72%
rename from src/bentoml_io/server/app.py
rename to src/_bentoml_impl/server/app.py
index a6b700d74d9..28d41f19af8 100644
--- a/src/bentoml_io/server/app.py
+++ b/src/_bentoml_impl/server/app.py
@@ -13,14 +13,14 @@
 from starlette.middleware import Middleware
 from starlette.staticfiles import StaticFiles
 
+from _bentoml_sdk import Service
 from bentoml._internal.container import BentoMLContainer
 from bentoml._internal.marshal.dispatcher import CorkDispatcher
 from bentoml._internal.server.base_app import BaseAppFactory
 from bentoml._internal.server.http_app import log_exception
+from bentoml.exceptions import BentoMLException
 from bentoml.exceptions import ServiceUnavailable
 
-from ..factory import Service
-
 if t.TYPE_CHECKING:
     from opentelemetry.sdk.trace import Span
     from starlette.applications import Starlette
@@ -28,11 +28,31 @@
     from starlette.responses import Response
     from starlette.routing import BaseRoute
 
+    from bentoml._internal import external_typing as ext
+    from bentoml._internal.context import ServiceContext
     from bentoml._internal.types import LifecycleHook
 
 R = t.TypeVar("R")
 
 
+class ContextMiddleware:
+    def __init__(self, app: ext.ASGIApp, context: ServiceContext) -> None:
+        self.app = app
+        self.context = context
+
+    async def __call__(
+        self, scope: ext.ASGIScope, receive: ext.ASGIReceive, send: ext.ASGISend
+    ) -> None:
+        from starlette.requests import Request
+
+        if scope["type"] not in ("http", "websocket"):
+            return await self.app(scope, receive, send)
+
+        req = Request(scope, receive, send)
+        with self.context.in_request(req):
+            await self.app(scope, receive, send)
+
+
 class ServiceAppFactory(BaseAppFactory):
     @inject
     def __init__(
@@ -87,8 +107,34 @@ async def openapi_spec_view(self, req: Request) -> Response:
             log_exception(req, sys.exc_info())
             raise
 
+    async def handle_uncaught_exception(self, req: Request, exc: Exception) -> Response:
+        from starlette.responses import JSONResponse
+
+        log_exception(req, sys.exc_info())
+        return JSONResponse(
+            {"error": "An unexpected error has occurred, please try again later."},
+            status_code=500,
+        )
+
+    async def handle_bentoml_exception(self, req: Request, exc: Exception) -> Response:
+        from starlette.responses import JSONResponse
+
+        log_exception(req, sys.exc_info())
+        assert isinstance(exc, BentoMLException)
+        status = exc.error_code.value
+        if 400 <= status < 500 and status not in (401, 403):
+            return JSONResponse(
+                content="BentoService error handling API request: %s" % str(exc),
+                status_code=status,
+            )
+        else:
+            return JSONResponse("", status_code=status)
+
     def __call__(self, is_main: bool = False) -> Starlette:
         app = super().__call__()
+
+        app.add_exception_handler(BentoMLException, self.handle_bentoml_exception)
+        app.add_exception_handler(Exception, self.handle_uncaught_exception)
         app.add_route("/schema.json", self.schema_view, name="schema")
         if is_main:
             if BentoMLContainer.new_index:
@@ -119,7 +165,9 @@ def middlewares(self) -> list[Middleware]:
 
         from bentoml._internal.container import BentoMLContainer
 
-        middlewares = super().middlewares
+        middlewares = super().middlewares + [
+            Middleware(ContextMiddleware, context=self.service.context)
+        ]
 
         for middleware_cls, options in self.service.middlewares:
             middlewares.append(Middleware(middleware_cls, **options))
@@ -261,16 +309,14 @@ async def inner_infer(
 
     async def api_endpoint(self, name: str, request: Request) -> Response:
         from starlette.concurrency import run_in_threadpool
-        from starlette.responses import JSONResponse
 
+        from _bentoml_sdk.io_models import ARGS
+        from _bentoml_sdk.io_models import KWARGS
         from bentoml._internal.container import BentoMLContainer
         from bentoml._internal.context import trace_context
         from bentoml._internal.utils import is_async_callable
         from bentoml._internal.utils.http import set_cookies
-        from bentoml.exceptions import BentoMLException
 
-        from ..io_models import ARGS
-        from ..io_models import KWARGS
         from ..serde import ALL_SERDE
 
         media_type = request.headers.get("Content-Type", "application/json")
@@ -278,68 +324,41 @@ async def api_endpoint(self, name: str, request: Request) -> Response:
 
         method = self.service.apis[name]
         func = getattr(self._service_instance, name)
-
-        with self.service.context.in_request(request) as ctx:
-            try:
-                serde = ALL_SERDE[media_type]()
-                input_data = await method.input_spec.from_http_request(request, serde)
-                input_args: tuple[t.Any, ...] = ()
-                input_params = {
-                    k: getattr(input_data, k) for k in input_data.model_fields
-                }
-                if method.ctx_param is not None:
-                    input_params[method.ctx_param] = ctx
-                if ARGS in input_params:
-                    input_args = tuple(input_params.pop(ARGS))
-                if KWARGS in input_params:
-                    input_params.update(input_params.pop(KWARGS))
-                if method.batchable:
-                    output = await self.batch_infer(name, input_args, input_params)
-                elif is_async_callable(func):
-                    output = await func(*input_args, **input_params)
-                elif inspect.isasyncgenfunction(func):
-                    output = func(*input_args, **input_params)
-                else:
-                    output = await run_in_threadpool(func, *input_args, **input_params)
-
-                response = await method.output_spec.to_http_response(output, serde)
-                response.headers.update(
-                    {"Server": f"BentoML Service/{self.service.name}"}
-                )
-
-                if method.ctx_param is not None:
-                    response.status_code = ctx.response.status_code
-                    response.headers.update(ctx.response.metadata)
-                    set_cookies(response, ctx.response.cookies)
-                if trace_context.request_id is not None:
-                    response.headers["X-BentoML-Request-ID"] = str(
-                        trace_context.request_id
-                    )
-                if (
-                    BentoMLContainer.http.response.trace_id.get()
-                    and trace_context.trace_id is not None
-                ):
-                    response.headers["X-BentoML-Trace-ID"] = str(trace_context.trace_id)
-            except BentoMLException as e:
-                log_exception(request, sys.exc_info())
-
-                status = e.error_code.value
-                if 400 <= status < 500 and status not in (401, 403):
-                    response = JSONResponse(
-                        content="BentoService error handling API request: %s" % str(e),
-                        status_code=status,
-                    )
-                else:
-                    response = JSONResponse("", status_code=status)
-            except Exception:  # pylint: disable=broad-except
-                # For all unexpected error, return 500 by default. For example,
-                # if users' model raises an error of division by zero.
-                log_exception(request, sys.exc_info())
-
-                response = JSONResponse(
-                    "An error has occurred in BentoML user code when handling this request, find the error details in server logs",
-                    status_code=500,
-                )
-            finally:
-                await request.close()
-            return response
+        ctx = self.service.context
+        try:
+            serde = ALL_SERDE[media_type]()
+            input_data = await method.input_spec.from_http_request(request, serde)
+            input_args: tuple[t.Any, ...] = ()
+            input_params = {k: getattr(input_data, k) for k in input_data.model_fields}
+            if method.ctx_param is not None:
+                input_params[method.ctx_param] = ctx
+            if ARGS in input_params:
+                input_args = tuple(input_params.pop(ARGS))
+            if KWARGS in input_params:
+                input_params.update(input_params.pop(KWARGS))
+            if method.batchable:
+                output = await self.batch_infer(name, input_args, input_params)
+            elif is_async_callable(func):
+                output = await func(*input_args, **input_params)
+            elif inspect.isasyncgenfunction(func):
+                output = func(*input_args, **input_params)
+            else:
+                output = await run_in_threadpool(func, *input_args, **input_params)
+
+            response = await method.output_spec.to_http_response(output, serde)
+            response.headers.update({"Server": f"BentoML Service/{self.service.name}"})
+
+            if method.ctx_param is not None:
+                response.status_code = ctx.response.status_code
+                response.headers.update(ctx.response.metadata)
+                set_cookies(response, ctx.response.cookies)
+            if trace_context.request_id is not None:
+                response.headers["X-BentoML-Request-ID"] = str(trace_context.request_id)
+            if (
+                BentoMLContainer.http.response.trace_id.get()
+                and trace_context.trace_id is not None
+            ):
+                response.headers["X-BentoML-Trace-ID"] = str(trace_context.trace_id)
+        finally:
+            await request.close()
+        return response
diff --git a/src/bentoml_io/server/main-openapi.html b/src/_bentoml_impl/server/main-openapi.html
similarity index 100%
rename from src/bentoml_io/server/main-openapi.html
rename to src/_bentoml_impl/server/main-openapi.html
diff --git a/src/bentoml_io/server/main-ui.html b/src/_bentoml_impl/server/main-ui.html
similarity index 100%
rename from src/bentoml_io/server/main-ui.html
rename to src/_bentoml_impl/server/main-ui.html
diff --git a/src/bentoml_io/server/serving.py b/src/_bentoml_impl/server/serving.py
similarity index 98%
rename from src/bentoml_io/server/serving.py
rename to src/_bentoml_impl/server/serving.py
index 3c7ff511506..2e07fbc813c 100644
--- a/src/bentoml_io/server/serving.py
+++ b/src/_bentoml_impl/server/serving.py
@@ -14,11 +14,10 @@
 from simple_di import Provide
 from simple_di import inject
 
+from _bentoml_sdk import Service
 from bentoml._internal.container import BentoMLContainer
 from bentoml.exceptions import BentoMLConfigException
 
-from ..factory import Service
-
 AnyService = Service[t.Any]
 
 if t.TYPE_CHECKING:
@@ -92,7 +91,7 @@ def _get_server_socket(
         raise BentoMLException("Unsupported platform")
 
 
-SERVICE_WORKER_SCRIPT = "bentoml_io.worker.service"
+SERVICE_WORKER_SCRIPT = "_bentoml_impl.worker.service"
 
 
 def create_service_watchers(
@@ -128,7 +127,7 @@ def create_service_watchers(
         "--fd",
         f"$(circus.sockets.{svc.name})",
         "--working-dir",
-        svc._working_dir,
+        svc.working_dir,
         "--worker-id",
         "$(CIRCUS.WID)",
     ]
diff --git a/src/_bentoml_impl/worker/__init__.py b/src/_bentoml_impl/worker/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/bentoml_io/worker/service.py b/src/_bentoml_impl/worker/service.py
similarity index 97%
rename from src/bentoml_io/worker/service.py
rename to src/_bentoml_impl/worker/service.py
index db2c95f26eb..f91bcc437ac 100644
--- a/src/bentoml_io/worker/service.py
+++ b/src/_bentoml_impl/worker/service.py
@@ -6,6 +6,9 @@
 
 import click
 
+if t.TYPE_CHECKING:
+    from _bentoml_sdk import Service
+
 
 @click.command()
 @click.argument("bento_identifier", type=click.STRING, required=False, default=".")
@@ -143,14 +146,13 @@ def main(
     from bentoml._internal.log import configure_server_logging
     from bentoml._internal.service import load
 
-    from ..factory import Service
     from ..server.app import ServiceAppFactory
 
     if runner_map:
         BentoMLContainer.remote_runner_mapping.set(
             t.cast(t.Dict[str, str], json.loads(runner_map))
         )
-    service = t.cast(Service[t.Any], load(bento_identifier, working_dir=working_dir))
+    service = t.cast("Service[t.Any]", load(bento_identifier, working_dir=working_dir))
     service.inject_config()
 
     component_context.component_type = "api_server"
diff --git a/src/_bentoml_sdk/__init__.py b/src/_bentoml_sdk/__init__.py
new file mode 100644
index 00000000000..15050b193dc
--- /dev/null
+++ b/src/_bentoml_sdk/__init__.py
@@ -0,0 +1,34 @@
+from bentoml._internal.utils.pkg import pkg_version_info
+
+if (ver := pkg_version_info("pydantic")) < (2,):
+    raise ImportError(
+        f"The new SDK runs on pydantic>=2.0.0, but the you have {'.'.join(map(str, ver))}. "
+        "Please upgrade it."
+    )
+
+from ._pydantic import add_custom_preparers
+
+add_custom_preparers()
+del add_custom_preparers
+# ruff: noqa
+
+from .api import api
+from .service import depends
+from .service import Service
+from .service import service
+from .service import runner_service
+from .io_models import IODescriptor
+from .schema import ContentType, DType, Shape
+
+__all__ = [
+    "api",
+    "depends",
+    "Service",
+    "service",
+    "runner_service",
+    # io descriptors
+    "IODescriptor",
+    "ContentType",
+    "DType",
+    "Shape",
+]
diff --git a/src/bentoml_io/_pydantic.py b/src/_bentoml_sdk/_pydantic.py
similarity index 58%
rename from src/bentoml_io/_pydantic.py
rename to src/_bentoml_sdk/_pydantic.py
index 56229628f8a..62f6d814c22 100644
--- a/src/bentoml_io/_pydantic.py
+++ b/src/_bentoml_sdk/_pydantic.py
@@ -6,9 +6,13 @@
 from typing_extensions import get_args
 from typing_extensions import get_origin
 
-from .types import DataframeSchema
-from .types import PILImageEncoder
-from .types import TensorSchema
+from .schema import ContentType
+from .schema import DataframeSchema
+from .schema import DType
+from .schema import FileSchema
+from .schema import PILImageEncoder
+from .schema import Shape
+from .schema import TensorSchema
 
 if t.TYPE_CHECKING:
     from pydantic import ConfigDict
@@ -44,18 +48,19 @@ def numpy_prepare_pydantic_annotations(
 
     args = get_args(source)
     dtype = np.dtype(args[1]).name if args else None
+    shape: tuple[int, ...] | None = None
 
     _, remaining_annotations = _known_annotated_metadata.collect_known_metadata(
         annotations
     )
-    schema = next(
-        (a for a in remaining_annotations if isinstance(a, TensorSchema)), None
-    )
-    if schema is None:
-        remaining_annotations.insert(0, TensorSchema("numpy-array", dtype=dtype))
-    elif schema.dtype is None:
-        schema.dtype = dtype
-    return source, remaining_annotations
+    for i, annotation in enumerate(remaining_annotations[:]):
+        if isinstance(annotation, Shape):
+            shape = annotation.dimensions
+            del remaining_annotations[i]
+        elif isinstance(annotation, DType):
+            dtype = annotation.dtype
+            del remaining_annotations[i]
+    return source, [TensorSchema("numpy-array", dtype, shape), *remaining_annotations]
 
 
 def torch_prepare_pydantic_annotations(
@@ -69,12 +74,20 @@ def torch_prepare_pydantic_annotations(
     if not issubclass(origin, torch.Tensor):
         return None
 
+    dtype: str | None = None
+    shape: tuple[int, ...] | None = None
+
     _, remaining_annotations = _known_annotated_metadata.collect_known_metadata(
         annotations
     )
-    if not any(isinstance(a, TensorSchema) for a in remaining_annotations):
-        remaining_annotations.insert(0, TensorSchema("torch-tensor"))
-    return origin, remaining_annotations
+    for i, annotation in enumerate(remaining_annotations[:]):
+        if isinstance(annotation, Shape):
+            shape = annotation.dimensions
+            del remaining_annotations[i]
+        elif isinstance(annotation, DType):
+            dtype = annotation.dtype
+            del remaining_annotations[i]
+    return source, [TensorSchema("torch-tensor", dtype, shape), *remaining_annotations]
 
 
 def tf_prepare_pydantic_annotations(
@@ -87,12 +100,20 @@ def tf_prepare_pydantic_annotations(
     if not issubclass(origin, tf.Tensor):
         return None
 
+    dtype: str | None = None
+    shape: tuple[int, ...] | None = None
+
     _, remaining_annotations = _known_annotated_metadata.collect_known_metadata(
         annotations
     )
-    if not any(isinstance(a, TensorSchema) for a in remaining_annotations):
-        remaining_annotations.insert(0, TensorSchema("tf-tensor"))
-    return origin, remaining_annotations
+    for i, annotation in enumerate(remaining_annotations[:]):
+        if isinstance(annotation, Shape):
+            shape = annotation.dimensions
+            del remaining_annotations[i]
+        elif isinstance(annotation, DType):
+            dtype = annotation.dtype
+            del remaining_annotations[i]
+    return source, [TensorSchema("tf-tensor", dtype, shape), *remaining_annotations]
 
 
 def pandas_prepare_pydantic_annotations(
@@ -114,7 +135,7 @@ def pandas_prepare_pydantic_annotations(
 
 
 def pil_prepare_pydantic_annotations(
-    source: t.Any, annotations: t.Iterable[t.Any], config: ConfigDict
+    source: t.Any, annotations: t.Iterable[t.Any], _config: ConfigDict
 ) -> tuple[t.Any, list[t.Any]] | None:
     if not getattr(source, "__module__", "").startswith("PIL."):
         return None
@@ -126,9 +147,33 @@ def pil_prepare_pydantic_annotations(
     _, remaining_annotations = _known_annotated_metadata.collect_known_metadata(
         annotations
     )
-    if not any(isinstance(a, PILImageEncoder) for a in remaining_annotations):
-        remaining_annotations.insert(0, PILImageEncoder())
-    return origin, remaining_annotations
+    return origin, [PILImageEncoder(), *remaining_annotations]
+
+
+def pathlib_prepare_pydantic_annotations(
+    source: t.Any, annotations: t.Iterable[t.Any], _config: ConfigDict
+) -> tuple[t.Any, list[t.Any]] | None:
+    import pathlib
+
+    if source not in {
+        pathlib.PurePath,
+        pathlib.PurePosixPath,
+        pathlib.PureWindowsPath,
+        pathlib.Path,
+        pathlib.PosixPath,
+        pathlib.WindowsPath,
+    }:
+        return None
+
+    _, remaining_annotations = _known_annotated_metadata.collect_known_metadata(
+        annotations
+    )
+    content_type: str | None = None
+    for i, annotation in enumerate(remaining_annotations[:]):
+        if isinstance(annotation, ContentType):
+            content_type = annotation.content_type
+            del remaining_annotations[i]
+    return source, [FileSchema(content_type=content_type), *remaining_annotations]
 
 
 def add_custom_preparers():
@@ -137,6 +182,9 @@ def add_custom_preparers():
     except ModuleNotFoundError:
         return
     _std_types_schema.PREPARE_METHODS = (
+        # pathlib
+        pathlib_prepare_pydantic_annotations,
+        # inherit from pydantic
         *_std_types_schema.PREPARE_METHODS,
         # tensors
         numpy_prepare_pydantic_annotations,
diff --git a/src/bentoml_io/api.py b/src/_bentoml_sdk/api.py
similarity index 98%
rename from src/bentoml_io/api.py
rename to src/_bentoml_sdk/api.py
index 19b1ca4cf3c..6d526ee600a 100644
--- a/src/bentoml_io/api.py
+++ b/src/_bentoml_sdk/api.py
@@ -12,7 +12,6 @@
 from bentoml._internal.utils import dict_filter_none
 
 from .io_models import IODescriptor
-from .openapi import REF_TEMPLATE
 
 R = t.TypeVar("R")
 T = t.TypeVar("T", bound="APIMethod[..., t.Any]")
@@ -121,6 +120,8 @@ def schema(self) -> dict[str, t.Any]:
         )
 
     def openapi_request(self) -> dict[str, t.Any]:
+        from .service.openapi import REF_TEMPLATE
+
         return {
             "content": {
                 self.input_spec.mime_type(): MediaType(
@@ -137,6 +138,8 @@ def openapi_request(self) -> dict[str, t.Any]:
         }
 
     def openapi_response(self) -> dict[str, t.Any]:
+        from .service.openapi import REF_TEMPLATE
+
         return {
             "description": SUCCESS_DESCRIPTION,
             "content": {
diff --git a/src/bentoml_io/io_models.py b/src/_bentoml_sdk/io_models.py
similarity index 85%
rename from src/bentoml_io/io_models.py
rename to src/_bentoml_sdk/io_models.py
index 2a64f08902c..4d53a9d8af8 100644
--- a/src/bentoml_io/io_models.py
+++ b/src/_bentoml_sdk/io_models.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
 import inspect
+import pathlib
 import sys
 import typing as t
 from typing import ClassVar
-from urllib.parse import quote
 
 from pydantic import BaseModel
 from pydantic import Field
@@ -14,8 +14,6 @@
 
 from bentoml._internal.service.openapi.specification import Schema
 
-from .types import File
-from .typing_utils import is_file_like
 from .typing_utils import is_image_type
 from .typing_utils import is_iterator_type
 from .typing_utils import is_list_type
@@ -24,7 +22,7 @@
     from starlette.requests import Request
     from starlette.responses import Response
 
-    from .serde import Serde
+    from _bentoml_impl.serde import Serde
 
 
 DEFAULT_TEXT_MEDIA_TYPE = "text/plain"
@@ -36,7 +34,7 @@ class IOMixin:
 
     @classmethod
     def openapi_components(cls, name: str) -> dict[str, Schema]:
-        from .openapi import REF_TEMPLATE
+        from .service.openapi import REF_TEMPLATE
 
         if issubclass(cls, RootModel):
             return {}
@@ -67,6 +65,8 @@ def mime_type(cls) -> str:
         if json_schema.get("type") == "string":
             return DEFAULT_TEXT_MEDIA_TYPE
         elif json_schema.get("type") == "file":
+            if "content_type" in json_schema:
+                return json_schema["content_type"]
             if (format := json_schema.get("format")) == "image":
                 return "image/*"
             elif format == "audio":
@@ -87,7 +87,7 @@ def __pydantic_init_subclass__(cls) -> None:
                 annotation = get_args(annotation)[0]
             if is_annotated(annotation):
                 annotation = get_args(annotation)[0]
-            if is_file_like(annotation) or is_image_type(annotation):
+            if issubclass(annotation, pathlib.PurePath) or is_image_type(annotation):
                 cls.multipart_fields.append(k)
 
     @classmethod
@@ -117,12 +117,14 @@ async def from_http_request(cls, request: Request, serde: Serde) -> IODescriptor
     @classmethod
     async def to_http_response(cls, obj: t.Any, serde: Serde) -> Response:
         """Convert a output value to HTTP response"""
+        import mimetypes
+
         from pydantic import RootModel
         from starlette.responses import FileResponse
         from starlette.responses import Response
         from starlette.responses import StreamingResponse
 
-        from .serde import JSONSerde
+        from _bentoml_impl.serde import JSONSerde
 
         structured_media_type = cls.media_type or serde.media_type
 
@@ -157,32 +159,17 @@ def content_stream() -> t.Generator[str | bytes, None, None]:
 
             return StreamingResponse(content_stream(), media_type=cls.mime_type())
         else:
-            if isinstance(obj, File) and isinstance(serde, JSONSerde):
-                media_type = obj.media_type or "application/octet-stream"
-                should_inline = obj.media_type and obj.media_type.startswith(
-                    ("image/", "audio/", "video/")
-                )
+            if isinstance(obj, pathlib.PurePath) and isinstance(serde, JSONSerde):
+                media_type = mimetypes.guess_type(obj)[0] or "application/octet-stream"
+                should_inline = media_type.startswith(("image/", "audio/", "video/"))
                 content_disposition_type = "inline" if should_inline else "attachment"
-                if obj.path:
-                    return FileResponse(
-                        obj.path,
-                        filename=obj.filename,
-                        media_type=media_type,
-                        content_disposition_type=content_disposition_type,
-                    )
-                else:
-                    headers = None
-                    if obj.filename:
-                        if (quoted := quote(obj.filename)) != obj.filename:
-                            content_disposition = (
-                                f"{content_disposition_type}; filename*=utf-8''{quoted}"
-                            )
-                        else:
-                            content_disposition = (
-                                f'{content_disposition_type}; filename="{obj.filename}"'
-                            )
-                        headers = {"Content-Disposition": content_disposition}
-                    return Response(obj.read(), media_type=media_type, headers=headers)
+                return FileResponse(
+                    obj,
+                    filename=obj.name,
+                    media_type=media_type,
+                    content_disposition_type=content_disposition_type,
+                )
+
             if not isinstance(obj, RootModel):
                 ins: IODescriptor = t.cast(IODescriptor, cls(obj))
             else:
diff --git a/src/bentoml_io/types.py b/src/_bentoml_sdk/schema.py
similarity index 61%
rename from src/bentoml_io/types.py
rename to src/_bentoml_sdk/schema.py
index 03b9b86fdf2..3525ebd682a 100644
--- a/src/bentoml_io/types.py
+++ b/src/_bentoml_sdk/schema.py
@@ -1,17 +1,20 @@
 from __future__ import annotations
 
 import contextlib
+import fnmatch
 import functools
 import io
 import operator
+import os
+import tempfile
 import typing as t
-from mimetypes import guess_type
 from pathlib import Path
+from pathlib import PurePath
 
 import attrs
+from annotated_types import BaseMetadata
 from pydantic_core import core_schema
 from starlette.datastructures import UploadFile
-from typing_extensions import Annotated
 
 from bentoml._internal.utils import dict_filter_none
 
@@ -28,6 +31,7 @@
     from typing_extensions import Literal
 
     TensorType = t.Union[np.ndarray[t.Any, t.Any], tf.Tensor, torch.Tensor]
+    TensorFormat = Literal["numpy-array", "tf-tensor", "torch-tensor"]
     from PIL import Image as PILImage
 else:
     from bentoml._internal.utils.lazy_loader import LazyLoader
@@ -41,7 +45,6 @@
 
 T = t.TypeVar("T")
 
-__all__ = ["File", "Image", "Audio", "Video", "Tensor", "Dataframe"]
 # This is an internal global state that is True when the model is being serialized for arrow
 __in_arrow_serialization__ = False
 
@@ -65,11 +68,11 @@ def decode(
         if isinstance(obj, UploadFile):
             formats = None
             if obj.headers.get("Content-Type", "").startswith("image/"):
-                formats = [obj.headers.get("Content-Type")[6:].upper()]
+                formats = [obj.headers["Content-Type"][6:].upper()]
             return PILImage.open(obj.file, formats=formats)
         if is_file_like(obj):
             return PILImage.open(obj)
-        return PILImage.open(io.BytesIO(obj))
+        return PILImage.open(io.BytesIO(t.cast(bytes, obj)))
 
     def encode(self, obj: PILImage.Image) -> bytes:
         buffer = io.BytesIO()
@@ -97,142 +100,73 @@ def __get_pydantic_json_schema__(
 
 
 @attrs.define
-class File(t.BinaryIO):
-    format: t.ClassVar[str] = "binary"
+class FileSchema:
+    format: str = "binary"
+    content_type: str | None = None
 
-    _fp: t.BinaryIO | None = attrs.field(default=None, repr=False)
-    filename: str | None = None
-    media_type: str | None = None
-    path: Path | None = None
+    def __attrs_post_init__(self) -> None:
+        if self.content_type is not None:
+            self.format = self.content_type.split("/")[0]
 
-    def __attrs_post_int__(self) -> None:
-        if self.filename is None:
-            if self._fp is not None and hasattr(self._fp, "name"):
-                self.filename = self._fp.name
-
-    @property
-    def fp(self) -> t.BinaryIO:
-        if self._fp is None:
-            if self.path is None:
-                raise ValueError("File is not initialized")
-            self._fp = open(self.path, "rb")
-        return self._fp
-
-    @classmethod
-    def encode(cls, obj: t.BinaryIO) -> bytes:
-        obj.seek(0)
-        return obj.read()
-
-    @classmethod
     def __get_pydantic_json_schema__(
-        cls, schema: core_schema.CoreSchema, handler: GetJsonSchemaHandler
+        self, schema: core_schema.CoreSchema, handler: GetJsonSchemaHandler
     ) -> dict[str, t.Any]:
         value = handler(schema)
         if handler.mode == "validation":
-            value.update({"type": "file", "format": cls.format})
+            value.update({"type": "file", "format": self.format})
+            if self.content_type is not None:
+                value.update({"content_type": self.content_type})
         else:
             value.update({"type": "string", "format": "binary"})
         return value
 
-    @classmethod
-    def decode(cls, obj: bytes | t.BinaryIO | UploadFile) -> File:
+    def encode(self, obj: Path) -> bytes:
+        return obj.read_bytes()
+
+    def decode(self, obj: bytes | t.BinaryIO | UploadFile | PurePath) -> Path:
+        from bentoml._internal.context import request_directory
+
+        media_type: str | None = None
+        if isinstance(obj, PurePath):
+            return Path(obj)
         if isinstance(obj, UploadFile):
-            return cls(
-                obj.file,
-                filename=obj.filename,
-                media_type=obj.content_type,
+            body = obj.file.read()
+            filename = obj.filename
+            media_type = obj.content_type
+        elif is_file_like(obj):
+            body = obj.read()
+            filename = (
+                os.path.basename(fn)
+                if (fn := getattr(obj, "name", None)) is not None
+                else None
             )
-        if is_file_like(obj):
-            return cls(obj)
-        return cls(io.BytesIO(t.cast(bytes, obj)))
+        else:
+            body = t.cast(bytes, obj)
+            filename = None
+        if media_type is not None and self.content_type is not None:
+            if not fnmatch.fnmatch(media_type, self.content_type):
+                raise ValueError(
+                    f"Invalid content type {media_type}, expected {self.content_type}"
+                )
+        with tempfile.NamedTemporaryFile(
+            suffix=filename, dir=request_directory.get(), delete=False
+        ) as f:
+            f.write(body)
+            return Path(f.name)
 
-    @classmethod
     def __get_pydantic_core_schema__(
-        cls, source: type[t.Any], handler: t.Callable[[t.Any], core_schema.CoreSchema]
+        self, source: type[t.Any], handler: t.Callable[[t.Any], core_schema.CoreSchema]
     ) -> core_schema.CoreSchema:
         return core_schema.no_info_after_validator_function(
-            function=cls.decode,
+            function=self.decode,
             schema=core_schema.any_schema(),
-            serialization=core_schema.plain_serializer_function_ser_schema(cls.encode),
-        )
-
-    @classmethod
-    def from_path(cls, path: str | Path, filename: str | None = None) -> File:
-        if filename is None:
-            filename = Path(path).name
-        return cls(
-            open(path, "rb"),
-            filename=filename,
-            media_type=guess_type(filename)[0],
-            path=Path(path),
+            serialization=core_schema.plain_serializer_function_ser_schema(self.encode),
         )
 
-    def __enter__(self) -> t.BinaryIO:
-        return self
-
-    def __exit__(self, *args: t.Any) -> None:
-        self.close()
-
-    def close(self) -> None:
-        if self._fp is not None:
-            self._fp.close()
-
-    def read(self, __n: int = -1) -> bytes:
-        return self.fp.read(__n)
-
-    def readlines(self, __hint: int = -1) -> list[bytes]:
-        return self.fp.readlines(__hint)
-
-    def __iter__(self) -> t.Iterator[bytes]:
-        return self.fp.__iter__()
-
-    def seek(self, __offset: int, __whence: int = 0) -> int:
-        return self.fp.seek(__offset, __whence)
-
-    def fileno(self) -> int:
-        return self.fp.fileno()
-
-    def tell(self) -> int:
-        return self.fp.tell()
-
-    def __getstate__(self) -> dict[str, t.Any]:
-        d = attrs.asdict(self)
-        if self._fp is not None:
-            d["_pos"] = self._fp.tell()
-            self._fp.seek(0)
-            d["_fp"] = self._fp.read()
-            self._fp.seek(d["_pos"])
-        return d
-
-    def __setstate__(self, d: dict[str, t.Any]) -> None:
-        if d["_fp"] is not None:
-            fp = d["_fp"] = io.BytesIO(d["_fp"])
-            fp.seek(d.pop("_pos", 0))
-        for k, v in d.items():
-            setattr(self, k, v)
-
-
-class Image(File):
-    format: t.ClassVar[str] = "image"
-
-    def to_pil_image(self) -> PILImage.Image:
-        formats = None
-        if self.media_type and self.media_type.startswith("image/"):
-            formats = [self.media_type[6:].upper()]
-        return PILImage.open(self, formats=formats)
-
-
-class Audio(File):
-    format: t.ClassVar[str] = "audio"
-
-
-class Video(File):
-    format: t.ClassVar[str] = "video"
-
 
 @attrs.frozen(unsafe_hash=True)
 class TensorSchema:
-    format: str
+    format: TensorFormat
     dtype: t.Optional[str] = None
     shape: t.Optional[t.Tuple[int, ...]] = None
 
@@ -353,43 +287,16 @@ def _validate(self, obj: t.Any) -> pd.DataFrame:
         return pd.DataFrame(obj, columns=self.columns)
 
 
-@t.overload
-def Tensor(
-    format: Literal["numpy-array"], dtype: str, shape: tuple[int, ...]
-) -> t.Type[np.ndarray[t.Any, t.Any]]:
-    ...
-
-
-@t.overload
-def Tensor(
-    format: Literal["tf-tensor"], dtype: str, shape: tuple[int, ...]
-) -> t.Type[tf.Tensor]:
-    ...
-
-
-@t.overload
-def Tensor(
-    format: Literal["torch-tensor"], dtype: str, shape: tuple[int, ...]
-) -> t.Type[torch.Tensor]:
-    ...
+@attrs.frozen
+class ContentType(BaseMetadata):
+    content_type: str
 
 
-def Tensor(
-    format: Literal["numpy-array", "torch-tensor", "tf-tensor"],
-    dtype: str | None = None,
-    shape: tuple[int, ...] | None = None,
-) -> type:
-    if format == "numpy-array":
-        annotation = np.ndarray[t.Any, t.Any]
-    elif format == "torch-tensor":
-        annotation = torch.Tensor
-    else:
-        annotation = tf.Tensor
-    return Annotated[annotation, TensorSchema(format, dtype, shape)]
+@attrs.frozen
+class Shape(BaseMetadata):
+    dimensions: tuple[int, ...]
 
 
-def Dataframe(
-    orient: t.Literal["records", "columns"] = "records",
-    columns: list[str] | None = None,
-) -> t.Type[pd.DataFrame]:
-    return Annotated[pd.DataFrame, DataframeSchema(orient, columns)]
+@attrs.frozen
+class DType(BaseMetadata):
+    dtype: str
diff --git a/src/_bentoml_sdk/service/__init__.py b/src/_bentoml_sdk/service/__init__.py
new file mode 100644
index 00000000000..d8da0ec9879
--- /dev/null
+++ b/src/_bentoml_sdk/service/__init__.py
@@ -0,0 +1,7 @@
+from .config import ServiceConfig
+from .dependency import depends
+from .factory import Service
+from .factory import runner_service
+from .factory import service
+
+__all__ = ["Service", "service", "runner_service", "depends", "ServiceConfig"]
diff --git a/src/bentoml_io/config.py b/src/_bentoml_sdk/service/config.py
similarity index 100%
rename from src/bentoml_io/config.py
rename to src/_bentoml_sdk/service/config.py
diff --git a/src/bentoml_io/dependency.py b/src/_bentoml_sdk/service/dependency.py
similarity index 90%
rename from src/bentoml_io/dependency.py
rename to src/_bentoml_sdk/service/dependency.py
index cda78b11ff3..7bdf01b6eed 100644
--- a/src/bentoml_io/dependency.py
+++ b/src/_bentoml_sdk/service/dependency.py
@@ -30,7 +30,7 @@ def get(
             BentoMLContainer.remote_runner_mapping
         ],
     ) -> T:
-        from .client.proxy import RemoteProxy
+        from _bentoml_impl.client.proxy import RemoteProxy
 
         key = self.cache_key()
         if key not in _dependent_cache:
@@ -60,7 +60,5 @@ def __getattr__(self, name: str) -> t.Any:
 
 def depends(on: Service[T]) -> Dependency[T]:
     if not isinstance(on, Service):
-        raise TypeError(
-            "depends() expects a class decorated with @bentoml_io.service()"
-        )
+        raise TypeError("depends() expects a class decorated with @bentoml.service()")
     return Dependency(on)
diff --git a/src/bentoml_io/factory.py b/src/_bentoml_sdk/service/factory.py
similarity index 98%
rename from src/bentoml_io/factory.py
rename to src/_bentoml_sdk/service/factory.py
index 2b25ef38ec8..c9f55c5f696 100644
--- a/src/bentoml_io/factory.py
+++ b/src/_bentoml_sdk/service/factory.py
@@ -21,7 +21,7 @@
 from bentoml._internal.utils import dict_filter_none
 from bentoml.exceptions import BentoMLException
 
-from .api import APIMethod
+from ..api import APIMethod
 from .config import ServiceConfig as Config
 from .config import validate
 
@@ -75,7 +75,7 @@ class Service(t.Generic[T]):
     )
     # service context
     context: ServiceContext = attrs.field(init=False, factory=ServiceContext)
-    _working_dir: str = attrs.field(init=False, factory=os.getcwd)
+    working_dir: str = attrs.field(init=False, factory=os.getcwd)
     # import info
     _caller_module: str = attrs.field(init=False)
     _import_str: str | None = attrs.field(init=False, default=None)
@@ -260,14 +260,13 @@ def serve_http(
         development_mode: bool = False,
         reload: bool = False,
     ) -> None:
+        from _bentoml_impl.server import serve_http
         from bentoml._internal.log import configure_logging
 
-        from .server.serving import serve_http
-
         configure_logging()
 
         if working_dir is None:
-            working_dir = self._working_dir
+            working_dir = self.working_dir
         serve_http(
             self,
             working_dir=working_dir,
diff --git a/src/bentoml_io/openapi.py b/src/_bentoml_sdk/service/openapi.py
similarity index 100%
rename from src/bentoml_io/openapi.py
rename to src/_bentoml_sdk/service/openapi.py
diff --git a/src/bentoml_io/typing_utils.py b/src/_bentoml_sdk/typing_utils.py
similarity index 100%
rename from src/bentoml_io/typing_utils.py
rename to src/_bentoml_sdk/typing_utils.py
diff --git a/src/bentoml/__init__.py b/src/bentoml/__init__.py
index 8b08164e7e4..1cbc610e842 100644
--- a/src/bentoml/__init__.py
+++ b/src/bentoml/__init__.py
@@ -15,6 +15,7 @@
 """
 
 from typing import TYPE_CHECKING
+from typing import Any
 
 from ._internal.configuration import BENTOML_VERSION as __version__
 from ._internal.configuration import load_config
@@ -24,6 +25,8 @@
 # Inject dependencies and configurations
 load_config()
 
+from pydantic import Field
+
 # BentoML built-in types
 from ._internal.bento import Bento
 from ._internal.cloud import YataiClient
@@ -98,6 +101,15 @@
     from . import cloud  # Cloud API
 
     # isort: on
+    from _bentoml_impl.client import AsyncHTTPClient
+    from _bentoml_impl.client import SyncHTTPClient
+    from _bentoml_sdk import ContentType
+    from _bentoml_sdk import DType
+    from _bentoml_sdk import Shape
+    from _bentoml_sdk import api
+    from _bentoml_sdk import depends
+    from _bentoml_sdk import runner_service
+    from _bentoml_sdk import service
 else:
     from ._internal.utils import LazyLoader as _LazyLoader
 
@@ -157,6 +169,30 @@
 
     del _LazyLoader
 
+    _NEW_SDK_ATTRS = [
+        "service",
+        "runner_service",
+        "api",
+        "depends",
+        "ContentType",
+        "Dtype",
+        "Shape",
+    ]
+    _NEW_CLIENTS = ["SyncHTTPClient", "AsyncHTTPClient"]
+
+    def __getattr__(name: str) -> Any:
+        if name not in _NEW_SDK_ATTRS + _NEW_CLIENTS:
+            raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+        import _bentoml_sdk
+
+        if name in _NEW_CLIENTS:
+            import _bentoml_impl.client
+
+            return getattr(_bentoml_impl.client, name)
+        else:
+            return getattr(_bentoml_sdk, name)
+
 
 __all__ = [
     "__version__",
@@ -229,4 +265,16 @@
     "set_serialization_strategy",
     "Strategy",
     "Resource",
+    # new SDK
+    "service",
+    "runner_service",
+    "api",
+    "depends",
+    "ContentType",
+    "DType",
+    "Shape",
+    "Field",
+    # new implementation
+    "SyncHTTPClient",
+    "AsyncHTTPClient",
 ]
diff --git a/src/bentoml/_internal/bento/bento.py b/src/bentoml/_internal/bento/bento.py
index 6bde11e3d7f..56b2605e380 100644
--- a/src/bentoml/_internal/bento/bento.py
+++ b/src/bentoml/_internal/bento/bento.py
@@ -45,11 +45,12 @@
 from .build_config import PythonOptions
 
 if TYPE_CHECKING:
-    from bentoml_io.api import APIMethod
-    from bentoml_io.config import ServiceConfig
-    from bentoml_io.server import Service as NewService
     from fs.base import FS
 
+    from _bentoml_sdk import Service as NewService
+    from _bentoml_sdk.api import APIMethod
+    from _bentoml_sdk.service import ServiceConfig
+
     from ..models import Model
     from ..service import Service
     from ..service.inference_api import InferenceAPI
@@ -558,19 +559,14 @@ def from_dependency(
         )
 
 
-def get_service_import_str(svc: Service | str):
+def get_service_import_str(svc: Service | NewService[t.Any] | str) -> str:
     from ..service import Service
 
     if isinstance(svc, Service):
         return svc.get_service_import_origin()[0]
-    try:
-        from bentoml_io.server import Service as NewService
-
-        if isinstance(svc, NewService):
-            return svc.import_string
-    except ImportError:
-        pass
-    return svc
+    if isinstance(svc, str):
+        return svc
+    return svc.import_string
 
 
 @attr.frozen(repr=False)
diff --git a/src/bentoml/_internal/cloud/schemas.py b/src/bentoml/_internal/cloud/schemas.py
index d60a55a7cf5..c7a4aeb48d7 100644
--- a/src/bentoml/_internal/cloud/schemas.py
+++ b/src/bentoml/_internal/cloud/schemas.py
@@ -15,7 +15,7 @@
 T = t.TypeVar("T")
 
 if TYPE_CHECKING:
-    from bentoml_io.config import ServiceConfig
+    from _bentoml_impl.config import ServiceConfig
 else:
     ServiceConfig = t.Dict[str, t.Any]
 
diff --git a/src/bentoml/_internal/context.py b/src/bentoml/_internal/context.py
index 548f4fbf687..6392043dfcb 100644
--- a/src/bentoml/_internal/context.py
+++ b/src/bentoml/_internal/context.py
@@ -3,6 +3,7 @@
 import contextlib
 import contextvars
 import os
+import tempfile
 import typing as t
 from abc import ABC
 from abc import abstractmethod
@@ -17,6 +18,11 @@
     import starlette.requests
     import starlette.responses
 
+# A request-unique directory for storing temporary files
+request_directory: contextvars.ContextVar[str] = contextvars.ContextVar(
+    "request_directory"
+)
+
 
 class Metadata(t.Mapping[str, str], ABC):
     @abstractmethod
@@ -92,11 +98,14 @@ def in_request(
             ServiceContext.RequestContext.from_http(request)
         )
         response_token = self._response_var.set(ServiceContext.ResponseContext())
-        try:
-            yield self
-        finally:
-            self._request_var.reset(request_token)
-            self._response_var.reset(response_token)
+        with tempfile.TemporaryDirectory(prefix="bentoml-request-") as temp_dir:
+            dir_token = request_directory.set(temp_dir)
+            try:
+                yield self
+            finally:
+                self._request_var.reset(request_token)
+                self._response_var.reset(response_token)
+                request_directory.reset(dir_token)
 
     @property
     def request(self) -> RequestContext:
@@ -106,6 +115,10 @@ def request(self) -> RequestContext:
     def response(self) -> ResponseContext:
         return self._response_var.get()
 
+    @property
+    def directory(self) -> str:
+        return request_directory.get()
+
     @attr.define
     class RequestContext:
         metadata: Metadata
diff --git a/src/bentoml/_internal/service/loader.py b/src/bentoml/_internal/service/loader.py
index e1b729de055..8b0e4cf4e85 100644
--- a/src/bentoml/_internal/service/loader.py
+++ b/src/bentoml/_internal/service/loader.py
@@ -26,7 +26,7 @@
 from .service import on_load_bento
 
 if TYPE_CHECKING:
-    from bentoml_io.server import Service as NewService
+    from _bentoml_sdk import Service as NewService
 
     from ..bento import BentoStore
     from .service import Service
@@ -66,7 +66,7 @@ def import_service(
 
     service_types: list[type] = [Service]
     try:
-        from bentoml_io import Service as NewService
+        from _bentoml_sdk import Service as NewService
 
         service_types.append(NewService)
     except ImportError:
diff --git a/src/bentoml/_internal/service/service.py b/src/bentoml/_internal/service/service.py
index 1acfaff5b9a..c41dfed797b 100644
--- a/src/bentoml/_internal/service/service.py
+++ b/src/bentoml/_internal/service/service.py
@@ -28,9 +28,9 @@
 
 if t.TYPE_CHECKING:
     import grpc
-    from bentoml_io.server import Service as NewService
 
     import bentoml
+    from _bentoml_sdk import Service as NewService
     from bentoml.grpc.types import AddServicerFn
     from bentoml.grpc.types import ServicerClass
     from bentoml.triton import _TritonRunner
diff --git a/src/bentoml/_internal/utils/analytics/usage_stats.py b/src/bentoml/_internal/utils/analytics/usage_stats.py
index 06aa25b28c3..08253bedd05 100644
--- a/src/bentoml/_internal/utils/analytics/usage_stats.py
+++ b/src/bentoml/_internal/utils/analytics/usage_stats.py
@@ -31,9 +31,9 @@
     T = t.TypeVar("T")
     AsyncFunc = t.Callable[P, t.Coroutine[t.Any, t.Any, t.Any]]
 
-    from bentoml_io import Service as NewService
     from prometheus_client.samples import Sample
 
+    from _bentoml_sdk import Service as NewService
     from bentoml import Service
 
     from ...server.metrics.prometheus import PrometheusClient
diff --git a/src/bentoml/bentos.py b/src/bentoml/bentos.py
index 950392f990a..f8f044c522c 100644
--- a/src/bentoml/bentos.py
+++ b/src/bentoml/bentos.py
@@ -501,7 +501,7 @@ def serve(
     max_concurrent_streams: int
     | None = Provide[BentoMLContainer.grpc.max_concurrent_streams],
     grpc_protocol_version: str | None = None,
-) -> Server:
+) -> Server[t.Any]:
     logger.warning(
         "bentoml.serve and bentoml.bentos.serve are deprecated; use bentoml.Server instead."
     )
diff --git a/src/bentoml/server.py b/src/bentoml/server.py
index 48dd3101401..14716d7d1b7 100644
--- a/src/bentoml/server.py
+++ b/src/bentoml/server.py
@@ -30,6 +30,8 @@
 if TYPE_CHECKING:
     from types import TracebackType
 
+    from _bentoml_sdk import Service as NewService
+
     _FILE: t.TypeAlias = None | int | t.IO[t.Any]
 
 
@@ -43,7 +45,7 @@
 
 
 class Server(ABC, t.Generic[ClientType]):
-    servable: str | Bento | Tag | Service
+    servable: str | Bento | Tag | Service | NewService[t.Any]
     host: str
     port: int
 
@@ -54,7 +56,7 @@ class Server(ABC, t.Generic[ClientType]):
 
     def __init__(
         self,
-        servable: str | Bento | Tag | Service,
+        servable: str | Bento | Tag | Service | NewService[t.Any],
         serve_cmd: str,
         reload: bool,
         production: bool,
@@ -94,6 +96,9 @@ def __init__(
                     "Cannot use 'bentoml.Service' as a server if it is defined in interactive session or Jupyter Notebooks."
                 )
             bento_str, working_dir = servable.get_service_import_origin()
+        elif not isinstance(servable, str):
+            bento_str = servable.import_string
+            working_dir = servable.working_dir
         else:
             bento_str = servable
 
@@ -219,13 +224,15 @@ def get_client(self) -> ClientType:
             if self.process.stdout is not None and not self.process.stdout.closed:
                 s = self.process.stdout.read()
                 logs += textwrap.indent(
-                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4  # type: ignore  # may be string
+                    s.decode("utf-8") if isinstance(s, bytes) else s,
+                    " " * 4,  # type: ignore  # may be string
                 )
             if self.process.stderr is not None and not self.process.stderr.closed:
                 logs += "\nServer Error:\n"
                 s = self.process.stderr.read()
                 logs += textwrap.indent(
-                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4  # type: ignore  # may be string
+                    s.decode("utf-8") if isinstance(s, bytes) else s,
+                    " " * 4,  # type: ignore  # may be string
                 )
             raise ServerStateException(logs)
         return self._get_client()
@@ -253,13 +260,15 @@ def stop(self) -> None:
             if self.process.stdout is not None and not self.process.stdout.closed:
                 s = self.process.stdout.read()
                 logs += textwrap.indent(
-                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4  # type: ignore  # may be string
+                    s.decode("utf-8") if isinstance(s, bytes) else s,
+                    " " * 4,  # type: ignore  # may be string
                 )
             if self.process.stderr is not None and not self.process.stderr.closed:
                 logs += "\nServer Error:\n"
                 s = self.process.stderr.read()
                 logs += textwrap.indent(
-                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4  # type: ignore  # may be string
+                    s.decode("utf-8") if isinstance(s, bytes) else s,
+                    " " * 4,  # type: ignore  # may be string
                 )
             logger.warning(logs)
             return
diff --git a/src/bentoml/types.py b/src/bentoml/types.py
index c380486d9c1..ca9e427a92c 100644
--- a/src/bentoml/types.py
+++ b/src/bentoml/types.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from ._internal.models.model import ModelSignature
 from ._internal.types import ModelSignatureDict
 
diff --git a/src/bentoml_cli/serve.py b/src/bentoml_cli/serve.py
index 0c2d96b08d7..ef0fc24c693 100644
--- a/src/bentoml_cli/serve.py
+++ b/src/bentoml_cli/serve.py
@@ -279,7 +279,7 @@ def serve(  # type: ignore (unused warning)
                     development_mode=False,
                 )
         else:
-            from bentoml_io.server.serving import serve_http
+            from _bentoml_impl.server import serve_http
 
             svc.inject_config()
             serve_http(
diff --git a/src/bentoml_cli/start.py b/src/bentoml_cli/start.py
index cb6fa3652ab..59aed2a314b 100644
--- a/src/bentoml_cli/start.py
+++ b/src/bentoml_cli/start.py
@@ -194,7 +194,7 @@ def start_http_server(  # type: ignore (unused warning)
                 ssl_ciphers=ssl_ciphers,
             )
         else:
-            from bentoml_io.server.serving import serve_http
+            from _bentoml_impl.server import serve_http
 
             svc.inject_config()
             serve_http(
diff --git a/src/bentoml_io/__init__.py b/src/bentoml_io/__init__.py
deleted file mode 100644
index ea5d34ac243..00000000000
--- a/src/bentoml_io/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from bentoml._internal.utils.pkg import pkg_version_info
-
-if (ver := pkg_version_info("pydantic")) < (2,):
-    raise ImportError(
-        f"bentoml_io runs on pydantic>=2.0.0, but the you have {'.'.join(map(str, ver))}. "
-        "Please upgrade it."
-    )
-
-from ._pydantic import add_custom_preparers
-
-add_custom_preparers()
-del add_custom_preparers
-# ruff: noqa
-# Re-export models for compatibility
-from bentoml import models as models
-
-from .api import api as api
-from .dependency import depends as depends
-from .factory import Service as Service
-from .factory import service as service
-from .factory import runner_service as runner_service
-from .io_models import IODescriptor as IODescriptor
diff --git a/tests/unit/bentoml_cli/test_env_manager.py b/tests/unit/bentoml_cli/test_env_manager.py
index b6b1f9dd248..fdb166e6f66 100644
--- a/tests/unit/bentoml_cli/test_env_manager.py
+++ b/tests/unit/bentoml_cli/test_env_manager.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import pytest
+
 from bentoml_cli.env_manager import remove_env_arg
 
 testdata = [
diff --git a/tests/unit/bentoml_io/test_allocator.py b/tests/unit/bentoml_io/test_allocator.py
index 8e55a12313e..07dd34b7919 100644
--- a/tests/unit/bentoml_io/test_allocator.py
+++ b/tests/unit/bentoml_io/test_allocator.py
@@ -1,14 +1,15 @@
 from unittest import mock
 
 import pytest
-from bentoml_io import service
-from bentoml_io.server.allocator import BentoMLConfigException
-from bentoml_io.server.allocator import ResourceAllocator
-from bentoml_io.server.allocator import ResourceUnavailable
+
+from _bentoml_impl.server.allocator import BentoMLConfigException
+from _bentoml_impl.server.allocator import ResourceAllocator
+from _bentoml_impl.server.allocator import ResourceUnavailable
+from _bentoml_sdk import service
 
 
 @mock.patch(
-    "bentoml_io.server.allocator.system_resources",
+    "_bentoml_impl.server.allocator.system_resources",
     return_value={"cpu": 8, "nvidia.com/gpu": list(range(4))},
 )
 def test_assign_gpus(_):
@@ -28,7 +29,7 @@ def test_assign_gpus(_):
 
 
 @mock.patch(
-    "bentoml_io.server.allocator.system_resources",
+    "_bentoml_impl.server.allocator.system_resources",
     return_value={"cpu": 8, "nvidia.com/gpu": list(range(4))},
 )
 def test_assign_gpus_float(_):
@@ -48,7 +49,7 @@ def test_assign_gpus_float(_):
 
 
 @mock.patch(
-    "bentoml_io.server.allocator.system_resources",
+    "_bentoml_impl.server.allocator.system_resources",
     return_value={"cpu": 8, "nvidia.com/gpu": list(range(4))},
 )
 def test_get_worker_env_gpu(_):
@@ -71,7 +72,7 @@ class Foo:
 
 
 @mock.patch(
-    "bentoml_io.server.allocator.system_resources",
+    "_bentoml_impl.server.allocator.system_resources",
     return_value={"cpu": 8, "nvidia.com/gpu": list(range(4))},
 )
 def test_get_worker_env_gpu_float(_):
@@ -101,7 +102,7 @@ class Bar:
 
 
 @mock.patch(
-    "bentoml_io.server.allocator.system_resources",
+    "_bentoml_impl.server.allocator.system_resources",
     return_value={"cpu": 8, "nvidia.com/gpu": list(range(4))},
 )
 def test_get_worker_env_cpu_count(_):
@@ -118,7 +119,7 @@ class Foo:
 
 
 @mock.patch(
-    "bentoml_io.server.allocator.system_resources",
+    "_bentoml_impl.server.allocator.system_resources",
     return_value={"cpu": 8, "nvidia.com/gpu": list(range(4))},
 )
 def test_get_worker_env_worker_number(_):
@@ -150,7 +151,7 @@ class Bar:
 
 
 @mock.patch(
-    "bentoml_io.server.allocator.system_resources",
+    "_bentoml_impl.server.allocator.system_resources",
     return_value={"cpu": 8, "nvidia.com/gpu": list(range(4))},
 )
 def test_get_worker_env_worker_gpu(_):
@@ -180,7 +181,7 @@ class Bar:
 
 
 @mock.patch(
-    "bentoml_io.server.allocator.system_resources",
+    "_bentoml_impl.server.allocator.system_resources",
     return_value={"cpu": 8, "nvidia.com/gpu": list(range(4))},
 )
 def test_get_worker_env_gpu_id(_):
diff --git a/webui/scripts/copy-lib.cjs b/webui/scripts/copy-lib.cjs
index 67570abfc60..8904c8afae6 100644
--- a/webui/scripts/copy-lib.cjs
+++ b/webui/scripts/copy-lib.cjs
@@ -1,7 +1,7 @@
 const path = require('node:path')
 const fs = require('fs-extra')
 
-const targetPath = path.resolve(__dirname, '../../src/bentoml_io/server/assets')
+const targetPath = path.resolve(__dirname, '../../src/_bentoml_impl/server/assets')
 const outDir = path.resolve(__dirname, '../lib/assets')
 
 async function main() {
diff --git a/webui/src/components/code/Python.tsx b/webui/src/components/code/Python.tsx
index 3d7e07e76d3..6eea5b80491 100644
--- a/webui/src/components/code/Python.tsx
+++ b/webui/src/components/code/Python.tsx
@@ -1,6 +1,6 @@
 import { StyledLink } from 'baseui/link'
 import isEmpty from 'lodash/isEmpty'
-import { isFileField } from '../../hooks/useQuery'
+import { isFileField, hasFileInSchema } from '../../hooks/useQuery'
 import type { DataType, TObject } from '../../types'
 import { useMountOptions } from '../../hooks/useMountOptions'
 import type { IClientProps } from './Base'
@@ -16,11 +16,11 @@ function formatValue(value: unknown, schema?: DataType, indent = 4) {
         return '[]'
 
       return `[\n${(value as File[]).map(
-        item => `${' '.repeat(indent + 4)}open("${item.name}", "rb")`,
+        item => `${' '.repeat(indent + 4)}Path("${item.name}")`,
       ).join(',\n')},\n${' '.repeat(indent)}]`
     }
     else {
-      return `open("${(value as File).name}", "rb")`
+      return `Path("${(value as File).name}")`
     }
   }
   else {
@@ -43,9 +43,9 @@ function formatQuery(data: object, schema: TObject, indent = 4) {
 function generateCode(data: object, path = '/', schema?: TObject, needAuth?: boolean) {
   const auth = needAuth ? `, token="******"` : ''
 
-  return `from bentoml_io.client import SyncHTTPClient
-
-with SyncHTTPClient("http://localhost:3000"${auth}) as client:
+  return `import bentoml
+${hasFileInSchema(schema ? { schema } : {}) ? 'from pathlib import Path\n' : ''}
+with bentoml.SyncHTTPClient("http://localhost:3000"${auth}) as client:
     result = client.${path.slice(1)}(${formatQuery(data, schema!, 4)})
 `
 }