Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
fxmarty committed Jun 11, 2024
1 parent dadfff6 commit 7c74705
Show file tree
Hide file tree
Showing 14 changed files with 92 additions and 6 deletions.
10 changes: 10 additions & 0 deletions integration-tests/models/test_bloom_560m.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,26 @@
import pytest

from testing_utils import require_backend_async


@pytest.fixture(scope="module")
@require_backend_async("cuda")
def bloom_560_handle(launcher):
with launcher("bigscience/bloom-560m") as handle:
yield handle


@pytest.fixture(scope="module")
@require_backend_async("cuda")
async def bloom_560(bloom_560_handle):
await bloom_560_handle.health(240)
return bloom_560_handle.client


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_bloom_560m(bloom_560, response_snapshot):
# The generated text is different on MI300X, and for what it is worth also different on H100.
response = await bloom_560.generate(
"Pour déguster un ortolan, il faut tout d'abord",
max_new_tokens=10,
Expand All @@ -28,7 +34,9 @@ async def test_bloom_560m(bloom_560, response_snapshot):


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_bloom_560m_all_params(bloom_560, response_snapshot):
# The generated text is different on MI300X, and for what it is worth also different on H100.
response = await bloom_560.generate(
"Pour déguster un ortolan, il faut tout d'abord",
max_new_tokens=10,
Expand All @@ -50,7 +58,9 @@ async def test_bloom_560m_all_params(bloom_560, response_snapshot):


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_bloom_560m_load(bloom_560, generate_load, response_snapshot):
# The generated text is different on MI300X, and for what it is worth also different on H100.
responses = await generate_load(
bloom_560,
"Pour déguster un ortolan, il faut tout d'abord",
Expand Down
4 changes: 4 additions & 0 deletions integration-tests/models/test_bloom_560m_sharded.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest

from testing_utils import require_backend_async


@pytest.fixture(scope="module")
def bloom_560m_sharded_handle(launcher):
Expand All @@ -14,7 +16,9 @@ async def bloom_560m_sharded(bloom_560m_sharded_handle):


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot):
# The generated text is different on MI300X, and for what it is worth also different on H100.
response = await bloom_560m_sharded.generate(
"Pour déguster un ortolan, il faut tout d'abord",
max_new_tokens=10,
Expand Down
9 changes: 9 additions & 0 deletions integration-tests/models/test_flash_gemma.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,27 @@
import pytest

from testing_utils import require_backend_async

# These tests do not pass on ROCm, that does not support head_dim > 128 (2b model is 256).


@pytest.fixture(scope="module")
@require_backend_async("cuda", "xpu")
def flash_gemma_handle(launcher):
with launcher("google/gemma-2b", num_shard=1) as handle:
yield handle


@pytest.fixture(scope="module")
@require_backend_async("cuda", "xpu")
async def flash_gemma(flash_gemma_handle):
await flash_gemma_handle.health(300)
return flash_gemma_handle.client


@pytest.mark.asyncio
@pytest.mark.private
@require_backend_async("cuda", "xpu")
async def test_flash_gemma(flash_gemma, response_snapshot):
response = await flash_gemma.generate(
"Test request", max_new_tokens=10, decoder_input_details=True
Expand All @@ -26,6 +33,7 @@ async def test_flash_gemma(flash_gemma, response_snapshot):

@pytest.mark.asyncio
@pytest.mark.private
@require_backend_async("cuda", "xpu")
async def test_flash_gemma_all_params(flash_gemma, response_snapshot):
response = await flash_gemma.generate(
"Test request",
Expand All @@ -49,6 +57,7 @@ async def test_flash_gemma_all_params(flash_gemma, response_snapshot):

@pytest.mark.asyncio
@pytest.mark.private
@require_backend_async("cuda", "xpu")
async def test_flash_gemma_load(flash_gemma, generate_load, response_snapshot):
responses = await generate_load(flash_gemma, "Test request", max_new_tokens=10, n=4)

Expand Down
7 changes: 7 additions & 0 deletions integration-tests/models/test_flash_gemma_gptq.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
import pytest

from testing_utils import require_backend_async


@pytest.fixture(scope="module")
@require_backend_async("cuda", "xpu")
def flash_gemma_gptq_handle(launcher):
with launcher("TechxGenus/gemma-2b-GPTQ", num_shard=1, quantize="gptq") as handle:
yield handle


@pytest.fixture(scope="module")
@require_backend_async("cuda", "xpu")
async def flash_gemma_gptq(flash_gemma_gptq_handle):
await flash_gemma_gptq_handle.health(300)
return flash_gemma_gptq_handle.client


@pytest.mark.asyncio
@pytest.mark.private
@require_backend_async("cuda", "xpu")
async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_response_snapshot):
response = await flash_gemma_gptq.generate(
"Test request", max_new_tokens=10, decoder_input_details=True
Expand All @@ -28,6 +33,7 @@ async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_response_snapsh

@pytest.mark.asyncio
@pytest.mark.private
@require_backend_async("cuda", "xpu")
async def test_flash_gemma_gptq_all_params(
flash_gemma_gptq, ignore_logprob_response_snapshot
):
Expand All @@ -53,6 +59,7 @@ async def test_flash_gemma_gptq_all_params(

@pytest.mark.asyncio
@pytest.mark.private
@require_backend_async("cuda", "xpu")
async def test_flash_gemma_gptq_load(
flash_gemma_gptq, generate_load, ignore_logprob_response_snapshot
):
Expand Down
7 changes: 7 additions & 0 deletions integration-tests/models/test_flash_pali_gemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,13 @@
import io
import base64

from testing_utils import require_backend_async

# These tests do not pass on ROCm, that does not support head_dim > 128 (2b model is 256).


@pytest.fixture(scope="module")
@require_backend_async("cuda", "xpu")
def flash_pali_gemma_handle(launcher):
with launcher(
"google/paligemma-3b-pt-224",
Expand All @@ -17,6 +22,7 @@ def flash_pali_gemma_handle(launcher):


@pytest.fixture(scope="module")
@require_backend_async("cuda", "xpu")
async def flash_pali_gemma(flash_pali_gemma_handle):
await flash_pali_gemma_handle.health(300)
return flash_pali_gemma_handle.client
Expand All @@ -30,6 +36,7 @@ def get_cow_beach():

@pytest.mark.asyncio
@pytest.mark.private
@require_backend_async("cuda", "xpu")
async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
cow = get_cow_beach()
inputs = f"![]({cow})Where is the cow standing?\n"
Expand Down
9 changes: 9 additions & 0 deletions integration-tests/models/test_flash_phi.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
import pytest

from testing_utils import require_backend_async

# These tests do not pass on ROCm, with different generations.


@pytest.fixture(scope="module")
@require_backend_async("cuda")
def flash_phi_handle(launcher):
with launcher("microsoft/phi-2", num_shard=1) as handle:
yield handle


@pytest.fixture(scope="module")
@require_backend_async("cuda")
async def flash_phi(flash_phi_handle):
await flash_phi_handle.health(300)
return flash_phi_handle.client


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_flash_phi(flash_phi, response_snapshot):
response = await flash_phi.generate(
"Test request", max_new_tokens=10, decoder_input_details=True
Expand All @@ -25,6 +32,7 @@ async def test_flash_phi(flash_phi, response_snapshot):


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_flash_phi_all_params(flash_phi, response_snapshot):
response = await flash_phi.generate(
"Test request",
Expand All @@ -48,6 +56,7 @@ async def test_flash_phi_all_params(flash_phi, response_snapshot):


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_flash_phi_load(flash_phi, generate_load, response_snapshot):
responses = await generate_load(flash_phi, "Test request", max_new_tokens=10, n=4)

Expand Down
4 changes: 4 additions & 0 deletions integration-tests/models/test_flash_santacoder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest

from testing_utils import require_backend_async


@pytest.fixture(scope="module")
def flash_santacoder_handle(launcher):
Expand All @@ -14,7 +16,9 @@ async def flash_santacoder(flash_santacoder_handle):


@pytest.mark.asyncio
@require_backend_async("cuda", "xpu")
async def test_flash_santacoder(flash_santacoder, response_snapshot):
# TODO: This test does not pass on ROCm although it should. To be investigated.
response = await flash_santacoder.generate(
"def print_hello", max_new_tokens=10, decoder_input_details=True
)
Expand Down
22 changes: 20 additions & 2 deletions integration-tests/models/test_flash_starcoder_gptq.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest

from testing_utils import SYSTEM, is_flaky_async, require_backend_async


@pytest.fixture(scope="module")
def flash_starcoder_gptq_handle(launcher):
Expand All @@ -14,17 +16,25 @@ async def flash_starcoder_gptq(flash_starcoder_gptq_handle):


@pytest.mark.asyncio
@is_flaky_async(max_attempts=10)
async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_response_snapshot):
response = await flash_starcoder_gptq.generate(
"def geometric_mean(L: List[float]):",
max_new_tokens=20,
decoder_input_details=True,
)
assert response.details.generated_tokens == 20
assert response == generous_response_snapshot
assert (
response.generated_text
== '\n """\n Calculate the geometric mean of a list of numbers.\n\n :param L: List'
)

if SYSTEM != "rocm":
assert response == generous_response_snapshot


@pytest.mark.asyncio
@is_flaky_async(max_attempts=10)
async def test_flash_starcoder_gptq_default_params(
flash_starcoder_gptq, generous_response_snapshot
):
Expand All @@ -37,13 +47,21 @@ async def test_flash_starcoder_gptq_default_params(
seed=0,
)
assert response.details.generated_tokens == 20
assert response == generous_response_snapshot
assert (
response.generated_text == "\n return reduce(lambda x, y: x * y, L) ** (1.0"
)

if SYSTEM != "rocm":
assert response == generous_response_snapshot


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_flash_starcoder_gptq_load(
flash_starcoder_gptq, generate_load, generous_response_snapshot
):
# TODO: exllamav2 gptq kernel is highly non-deterministic on ROCm.

responses = await generate_load(
flash_starcoder_gptq,
"def geometric_mean(L: List[float]):",
Expand Down
6 changes: 5 additions & 1 deletion integration-tests/models/test_llava_next.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pytest
import base64

from testing_utils import SYSTEM


# TODO fix the server parsser to count inline image tokens correctly
def get_chicken():
Expand Down Expand Up @@ -81,4 +83,6 @@ async def test_flash_llava_next_load(
assert len(generated_texts) == 4
assert all([r.generated_text == generated_texts[0] for r in responses])

assert responses == response_snapshot
if SYSTEM != "rocm":
# Logprobs are not strictly identical on AMD GPUs.
assert responses == response_snapshot
7 changes: 7 additions & 0 deletions integration-tests/models/test_mamba.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
import pytest

from testing_utils import require_backend_async


@pytest.fixture(scope="module")
@require_backend_async("cuda")
def fused_kernel_mamba_handle(launcher):
with launcher("state-spaces/mamba-130m", num_shard=1) as handle:
yield handle


@pytest.fixture(scope="module")
@require_backend_async("cuda")
async def fused_kernel_mamba(fused_kernel_mamba_handle):
await fused_kernel_mamba_handle.health(300)
return fused_kernel_mamba_handle.client


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_mamba(fused_kernel_mamba, response_snapshot):
response = await fused_kernel_mamba.generate(
"What is Deep Learning?", max_new_tokens=10
Expand All @@ -25,6 +30,7 @@ async def test_mamba(fused_kernel_mamba, response_snapshot):


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_mamba_all_params(fused_kernel_mamba, response_snapshot):
response = await fused_kernel_mamba.generate(
"blue, red, yellow, ",
Expand All @@ -51,6 +57,7 @@ async def test_mamba_all_params(fused_kernel_mamba, response_snapshot):


@pytest.mark.asyncio
@require_backend_async("cuda")
async def test_mamba_load(
fused_kernel_mamba, generate_load, generous_response_snapshot
):
Expand Down
3 changes: 2 additions & 1 deletion integration-tests/models/test_mt0_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

@pytest.fixture(scope="module")
def mt0_base_handle(launcher):
with launcher("bigscience/mt0-base") as handle:
# We use TP=1 as this model is loaded with AutoModel (sharding not supported).
with launcher("bigscience/mt0-base", num_shard=1) as handle:
yield handle


Expand Down
2 changes: 1 addition & 1 deletion server/text_generation_server/layers/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def forward(self, inp: torch.Tensor) -> torch.Tensor:
out = F.linear(inp, weight)

if batched:
out.view(*inp_shape[:-1], out.shape[-1])
out = out.view(*inp_shape[:-1], out.shape[-1])

if bias is not None:
out = out + bias
Expand Down
Loading

0 comments on commit 7c74705

Please sign in to comment.