From 0730f56afb68b2d7e41fe070b7cd634cfad46997 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 29 Jul 2024 14:01:40 -0400 Subject: [PATCH] Remove pyarrow as a direct dependency (#2228) Signed-off-by: Thomas J. Fan --- .github/workflows/pythonbuild.yml | 4 ++-- dev-requirements.in | 1 + pyproject.toml | 1 - tests/flytekit/unit/core/test_type_engine.py | 3 ++- tests/flytekit/unit/deck/test_renderer.py | 3 ++- tests/flytekit/unit/lazy_module/test_lazy_module.py | 4 ++-- .../flytekit/unit/types/structured_dataset/test_arrow_data.py | 3 ++- .../unit/types/structured_dataset/test_structured_dataset.py | 2 +- .../structured_dataset/test_structured_dataset_handlers.py | 2 +- .../structured_dataset/test_structured_dataset_workflow.py | 4 ++-- .../test_structured_dataset_workflow_with_nested_type.py | 2 +- 11 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.github/workflows/pythonbuild.yml b/.github/workflows/pythonbuild.yml index 005658497b..10bbe3aa10 100644 --- a/.github/workflows/pythonbuild.yml +++ b/.github/workflows/pythonbuild.yml @@ -59,7 +59,7 @@ jobs: run: | pip install uv make setup-global-uv - uv pip uninstall --system pandas + uv pip uninstall --system pandas pyarrow uv pip freeze - name: Test with coverage run: | @@ -98,7 +98,7 @@ jobs: run: | pip install uv make setup-global-uv - uv pip uninstall --system pandas + uv pip uninstall --system pandas pyarrow uv pip freeze - name: Run extras unit tests with coverage # Skip this step if running on python 3.12 due to https://github.com/tensorflow/tensorflow/issues/62003 diff --git a/dev-requirements.in b/dev-requirements.in index 2c91767a01..b2cec23dc7 100644 --- a/dev-requirements.in +++ b/dev-requirements.in @@ -50,6 +50,7 @@ autoflake pillow numpy pandas +pyarrow scikit-learn types-requests prometheus-client diff --git a/pyproject.toml b/pyproject.toml index cd11580f5a..e5a5f21137 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,6 @@ dependencies = [ "marshmallow-jsonschema>=0.12.0", "mashumaro>=3.11", "protobuf!=4.25.0", - "pyarrow", "pygments", "python-json-logger>=2.0.0", "pytimeparse>=1.1.8", diff --git a/tests/flytekit/unit/core/test_type_engine.py b/tests/flytekit/unit/core/test_type_engine.py index 0baf81c223..9ce7330ccd 100644 --- a/tests/flytekit/unit/core/test_type_engine.py +++ b/tests/flytekit/unit/core/test_type_engine.py @@ -12,7 +12,6 @@ from typing import List, Optional, Type import mock -import pyarrow as pa import pytest import typing_extensions from dataclasses_json import DataClassJsonMixin, dataclass_json @@ -1408,9 +1407,11 @@ class UnsupportedEnumValues(Enum): BLUE = 3 +@pytest.mark.skipif("polars" not in sys.modules, reason="pyarrow is not installed.") @pytest.mark.skipif("pandas" not in sys.modules, reason="Pandas is not installed.") def test_structured_dataset_type(): import pandas as pd + import pyarrow as pa from pandas._testing import assert_frame_equal name = "Name" diff --git a/tests/flytekit/unit/deck/test_renderer.py b/tests/flytekit/unit/deck/test_renderer.py index 7263139acc..993e5cf2c4 100644 --- a/tests/flytekit/unit/deck/test_renderer.py +++ b/tests/flytekit/unit/deck/test_renderer.py @@ -1,11 +1,11 @@ import sys -import pyarrow as pa import pytest from flytekit.deck.renderer import DEFAULT_MAX_COLS, DEFAULT_MAX_ROWS, ArrowRenderer, TopFrameRenderer +@pytest.mark.skipif("pyarrow" not in sys.modules, reason="Pyarrow is not installed.") @pytest.mark.skipif("pandas" not in sys.modules, reason="Pandas is not installed.") @pytest.mark.parametrize( "rows, cols, max_rows, expected_max_rows, max_cols, expected_max_cols", @@ -23,6 +23,7 @@ ) def test_renderer(rows, cols, max_rows, expected_max_rows, max_cols, expected_max_cols): import pandas as pd + import pyarrow as pa df = pd.DataFrame({f"abc-{k}": list(range(rows)) for k in range(cols)}) pa_df = pa.Table.from_pandas(df) diff --git a/tests/flytekit/unit/lazy_module/test_lazy_module.py b/tests/flytekit/unit/lazy_module/test_lazy_module.py index 714b3052e7..83c0fb86a7 100644 --- a/tests/flytekit/unit/lazy_module/test_lazy_module.py +++ b/tests/flytekit/unit/lazy_module/test_lazy_module.py @@ -4,8 +4,8 @@ def test_lazy_module(): - mod = lazy_module("pyarrow") - assert mod.__name__ == "pyarrow" + mod = lazy_module("click") + assert mod.__name__ == "click" mod = lazy_module("fake_module") assert isinstance(mod, LazyModule) with pytest.raises(ImportError, match="Module fake_module is not yet installed."): diff --git a/tests/flytekit/unit/types/structured_dataset/test_arrow_data.py b/tests/flytekit/unit/types/structured_dataset/test_arrow_data.py index 9df8c9ba4b..05ca7aedd2 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_arrow_data.py +++ b/tests/flytekit/unit/types/structured_dataset/test_arrow_data.py @@ -1,16 +1,17 @@ import sys import typing -import pyarrow as pa import pytest from typing_extensions import Annotated from flytekit import kwtypes, task +@pytest.mark.skipif("pyarrow" not in sys.modules, reason="Pyarrow is not installed.") @pytest.mark.skipif("pandas" not in sys.modules, reason="Pandas is not installed.") def test_structured_dataset_wf(): import pandas as pd + import pyarrow as pa cols = kwtypes(Name=str, Age=int) subset_cols = kwtypes(Name=str) diff --git a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset.py b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset.py index 8b82d0564a..9e29416523 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset.py +++ b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset.py @@ -4,7 +4,6 @@ from collections import OrderedDict import google.cloud.bigquery -import pyarrow as pa import pytest from fsspec.utils import get_protocol from typing_extensions import Annotated @@ -34,6 +33,7 @@ ) pd = pytest.importorskip("pandas") +pa = pytest.importorskip("pyarrow") my_cols = kwtypes(w=typing.Dict[str, typing.Dict[str, int]], x=typing.List[typing.List[int]], y=int, z=str) diff --git a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_handlers.py b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_handlers.py index b18da019ee..a9f3901bd0 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_handlers.py +++ b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_handlers.py @@ -1,7 +1,6 @@ import typing import mock -import pyarrow as pa import pytest from flytekit.core import context_manager @@ -17,6 +16,7 @@ ) pd = pytest.importorskip("pandas") +pa = pytest.importorskip("pyarrow") my_cols = kwtypes(w=typing.Dict[str, typing.Dict[str, int]], x=typing.List[typing.List[int]], y=int, z=str) fields = [("some_int", pa.int32()), ("some_string", pa.string())] arrow_schema = pa.schema(fields) diff --git a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow.py b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow.py index 91fa72b526..e8233b3085 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow.py +++ b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow.py @@ -3,8 +3,6 @@ from dataclasses import dataclass import numpy as np -import pyarrow as pa -import pyarrow.parquet as pq import pytest from typing_extensions import Annotated @@ -24,6 +22,8 @@ ) pd = pytest.importorskip("pandas") +pa = pytest.importorskip("pyarrow") +pq = pytest.importorskip("pyarrow.parquet") PANDAS_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory() NUMPY_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory() diff --git a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow_with_nested_type.py b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow_with_nested_type.py index 62c0f6d651..0d28a2707f 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow_with_nested_type.py +++ b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow_with_nested_type.py @@ -1,12 +1,12 @@ from dataclasses import dataclass -import pyarrow as pa import pytest from typing_extensions import Annotated from flytekit import FlyteContextManager, StructuredDataset, kwtypes, task, workflow pd = pytest.importorskip("pandas") +pa = pytest.importorskip("pyarrow") PANDAS_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory() NUMPY_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory()