Skip to content

Commit

Permalink
Fix content type annotations for pandas codecs
Browse files Browse the repository at this point in the history
  • Loading branch information
Adrian Gonzalez-Martin committed May 23, 2023
1 parent d252bd4 commit 28e3619
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 22 deletions.
57 changes: 43 additions & 14 deletions mlserver/codecs/pandas.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import pandas as pd
import numpy as np

from typing import Optional, Any, List
from typing import Optional, Any, List, Tuple

from .base import RequestCodec, register_request_codec
from .numpy import to_datatype, to_dtype
from .string import encode_str
from .string import encode_str, StringCodec
from .utils import get_decoded_or_raw, InputOrOutput, inject_batch_dimension
from .lists import ListElement
from ..types import InferenceRequest, InferenceResponse, RequestInput, ResponseOutput
from ..types import (
InferenceRequest,
InferenceResponse,
RequestInput,
ResponseOutput,
Parameters,
)


def _to_series(input_or_output: InputOrOutput) -> pd.Series:
Expand All @@ -29,28 +35,46 @@ def _to_series(input_or_output: InputOrOutput) -> pd.Series:
def _to_response_output(series: pd.Series, use_bytes: bool = True) -> ResponseOutput:
    """Encode a single pandas Series as a ``ResponseOutput``.

    Args:
        series: Column to encode. Its ``name`` becomes the output name and
            its ``dtype`` is mapped to a datatype through ``to_datatype``.
        use_bytes: Forwarded to ``_process_bytes``; when true, ``str``
            elements of a ``BYTES`` column are encoded to bytes.

    Returns:
        A ``ResponseOutput`` carrying the column data. ``parameters`` is only
        set when ``_process_bytes`` reports a content type for the column;
        otherwise it is left as ``None``.
    """
    datatype = to_datatype(series.dtype)
    data = series.tolist()
    content_type = None

    if datatype == "BYTES":
        # Strings are encoded in exactly one place. Pre-encoding them here
        # would hide the fact that the column held strings, and the content
        # type annotation would be lost.
        data, content_type = _process_bytes(data, use_bytes)

    shape = inject_batch_dimension(list(series.shape))

    parameters = None
    if content_type:
        parameters = Parameters(content_type=content_type)

    return ResponseOutput(
        name=series.name,
        shape=shape,
        # If string, it should be encoded to bytes
        data=data,
        datatype=datatype,
        parameters=parameters,
    )


def _ensure_bytes(elem: ListElement) -> bytes:
    """Return ``elem`` encoded with ``encode_str`` if it is a ``str``.

    Any other element is returned unchanged.
    """
    if isinstance(elem, str):
        return encode_str(elem)

    return elem


def _process_bytes(
    data: List[ListElement], use_bytes: bool = True
) -> Tuple[List[ListElement], Optional[str]]:
    """Prepare the elements of a ``BYTES`` column and infer its content type.

    To ensure that "string" columns can be encoded in gRPC, ``str`` elements
    are encoded as bytes when ``use_bytes`` is true. The function also keeps
    track of whether the list should be treated in the future as a list of
    strings.

    Args:
        data: Elements of the column.
        use_bytes: When true, every ``str`` element is passed through
            ``encode_str``; when false, elements are left as they are.

    Returns:
        A ``(processed, content_type)`` tuple. ``content_type`` is
        ``StringCodec.ContentType`` when every element of ``data`` is a
        ``str`` (including the case where ``data`` is empty), and ``None`` as
        soon as any non-string element is found.
    """
    processed = []
    content_type: Optional[str] = StringCodec.ContentType
    for elem in data:
        converted = elem
        if not isinstance(elem, str):
            # There was a non-string element, so we can't determine a content
            # type
            content_type = None
        elif use_bytes:
            converted = encode_str(elem)

        processed.append(converted)

    return processed, content_type


@register_request_codec
Expand Down Expand Up @@ -79,7 +103,10 @@ def encode_response(
outputs = cls.encode_outputs(payload, use_bytes=use_bytes)

return InferenceResponse(
model_name=model_name, model_version=model_version, outputs=outputs
model_name=model_name,
model_version=model_version,
parameters=Parameters(content_type=cls.ContentType),
outputs=outputs,
)

@classmethod
Expand All @@ -106,15 +133,17 @@ def encode_request(
outputs = cls.encode_outputs(payload, use_bytes=use_bytes)

return InferenceRequest(
parameters=Parameters(content_type=cls.ContentType),
inputs=[
RequestInput(
name=output.name,
datatype=output.datatype,
shape=output.shape,
data=output.data,
parameters=output.parameters,
)
for output in outputs
]
],
)

@classmethod
Expand Down
24 changes: 21 additions & 3 deletions tests/codecs/test_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,15 +211,27 @@ def test_decode_request_not_found(
[PandasCodec],
[
ResponseOutput(name="a", datatype="INT64", shape=[1, 1], data=[2]),
ResponseOutput(name="b", datatype="BYTES", shape=[1, 1], data=[b"foo"]),
ResponseOutput(
name="b",
datatype="BYTES",
shape=[1, 1],
data=[b"foo"],
parameters=Parameters(content_type=StringCodec.ContentType),
),
],
),
(
(pd.DataFrame({"a": [2], "b": ["foo"]}), ["bar"]),
[PandasCodec, StringCodec],
[
ResponseOutput(name="a", datatype="INT64", shape=[1, 1], data=[2]),
ResponseOutput(name="b", datatype="BYTES", shape=[1, 1], data=[b"foo"]),
ResponseOutput(
name="b",
datatype="BYTES",
shape=[1, 1],
data=[b"foo"],
parameters=Parameters(content_type=StringCodec.ContentType),
),
ResponseOutput(
name="output-1",
datatype="BYTES",
Expand All @@ -240,7 +252,13 @@ def test_decode_request_not_found(
data=[3],
),
ResponseOutput(name="a", datatype="INT64", shape=[1, 1], data=[2]),
ResponseOutput(name="b", datatype="BYTES", shape=[1, 1], data=[b"foo"]),
ResponseOutput(
name="b",
datatype="BYTES",
shape=[1, 1],
data=[b"foo"],
parameters=Parameters(content_type=StringCodec.ContentType),
),
],
),
],
Expand Down
36 changes: 31 additions & 5 deletions tests/codecs/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any

from mlserver.codecs.pandas import PandasCodec, _to_response_output
from mlserver.codecs.string import StringCodec
from mlserver.types import (
InferenceRequest,
InferenceResponse,
Expand Down Expand Up @@ -32,14 +33,22 @@ def test_can_encode(payload: Any, expected: bool):
pd.Series(data=["hey", "abc"], name="foo"),
True,
ResponseOutput(
name="foo", shape=[2, 1], data=[b"hey", b"abc"], datatype="BYTES"
name="foo",
shape=[2, 1],
data=[b"hey", b"abc"],
datatype="BYTES",
parameters=Parameters(content_type=StringCodec.ContentType),
),
),
(
pd.Series(data=["hey", "abc"], name="foo"),
False,
ResponseOutput(
name="foo", shape=[2, 1], data=["hey", "abc"], datatype="BYTES"
name="foo",
shape=[2, 1],
data=["hey", "abc"],
datatype="BYTES",
parameters=Parameters(content_type=StringCodec.ContentType),
),
),
(
Expand Down Expand Up @@ -82,15 +91,20 @@ def test_to_response_output(series, use_bytes, expected):
True,
InferenceResponse(
model_name="my-model",
parameters=Parameters(content_type=PandasCodec.ContentType),
outputs=[
ResponseOutput(
name="a", shape=[3, 1], datatype="INT64", data=[1, 2, 3]
name="a",
shape=[3, 1],
datatype="INT64",
data=[1, 2, 3],
),
ResponseOutput(
name="b",
shape=[3, 1],
datatype="BYTES",
data=[b"A", b"B", b"C"],
parameters=Parameters(content_type=StringCodec.ContentType),
),
],
),
Expand All @@ -105,12 +119,17 @@ def test_to_response_output(series, use_bytes, expected):
False,
InferenceResponse(
model_name="my-model",
parameters=Parameters(content_type=PandasCodec.ContentType),
outputs=[
ResponseOutput(
name="a", shape=[3, 1], datatype="INT64", data=[1, 2, 3]
),
ResponseOutput(
name="b", shape=[3, 1], datatype="BYTES", data=["A", "B", "C"]
name="b",
shape=[3, 1],
datatype="BYTES",
data=["A", "B", "C"],
parameters=Parameters(content_type=StringCodec.ContentType),
),
],
),
Expand Down Expand Up @@ -219,6 +238,7 @@ def test_decode_response(response: InferenceResponse, expected: pd.DataFrame):
),
True,
InferenceRequest(
parameters=Parameters(content_type=PandasCodec.ContentType),
inputs=[
RequestInput(
name="a", shape=[3, 1], datatype="INT64", data=[1, 2, 3]
Expand All @@ -228,6 +248,7 @@ def test_decode_response(response: InferenceResponse, expected: pd.DataFrame):
shape=[3, 1],
datatype="BYTES",
data=[b"A", b"B", b"C"],
parameters=Parameters(content_type=StringCodec.ContentType),
),
],
),
Expand All @@ -241,12 +262,17 @@ def test_decode_response(response: InferenceResponse, expected: pd.DataFrame):
),
False,
InferenceRequest(
parameters=Parameters(content_type=PandasCodec.ContentType),
inputs=[
RequestInput(
name="a", shape=[3, 1], datatype="INT64", data=[1, 2, 3]
),
RequestInput(
name="b", shape=[3, 1], datatype="BYTES", data=["A", "B", "C"]
name="b",
shape=[3, 1],
datatype="BYTES",
data=["A", "B", "C"],
parameters=Parameters(content_type=StringCodec.ContentType),
),
],
),
Expand Down
2 changes: 2 additions & 0 deletions tests/codecs/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def test_encode_response_output(
InferenceResponse(
model_name="sum-model",
model_version="v1.2.3",
parameters=Parameters(content_type=PandasCodec.ContentType),
outputs=[
ResponseOutput(
name="a", datatype="INT64", shape=[3, 1], data=[1, 2, 3]
Expand All @@ -93,6 +94,7 @@ def test_encode_response_output(
datatype="BYTES",
shape=[3, 1],
data=[b"a", b"b", b"c"],
parameters=Parameters(content_type=StringCodec.ContentType),
),
],
),
Expand Down

0 comments on commit 28e3619

Please sign in to comment.