Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Replace pandas with fireducks #35

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mbta-performance/chalicelib/gtfs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from datetime import date
from tempfile import TemporaryDirectory
-import pandas as pd
+import fireducks.pandas as pd
from typing import Iterable
import boto3

Expand Down
2 changes: 1 addition & 1 deletion mbta-performance/chalicelib/historic/gtfs_archive.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import datetime
-import pandas as pd
+import fireducks.pandas as pd
import pathlib
import shutil
import urllib.request
Expand Down
2 changes: 1 addition & 1 deletion mbta-performance/chalicelib/historic/process.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-import pandas as pd
+import fireducks.pandas as pd
import pathlib
from .constants import HISTORIC_COLUMNS_PRE_LAMP as HISTORIC_COLUMNS
from .gtfs_archive import add_gtfs_headways
Expand Down
4 changes: 2 additions & 2 deletions mbta-performance/chalicelib/lamp/backfill/main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
-import pandas as pd
+import fireducks.pandas as pd
from ..ingest import fetch_pq_file_from_remote, ingest_pq_file, upload_to_s3
from ... import parallel
from datetime import date, timedelta


_parallel_upload = parallel.make_parallel(upload_to_s3)

-EARLIEST_LAMP_DATA = date(2019, 9, 15)
+EARLIEST_LAMP_DATA = date(2023, 10, 31)


def backfill_all_in_index():
Expand Down
2 changes: 1 addition & 1 deletion mbta-performance/chalicelib/lamp/ingest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from datetime import date
import io
-import pandas as pd
+import fireducks.pandas as pd
import requests
from typing import Tuple

Expand Down
2 changes: 1 addition & 1 deletion mbta-performance/chalicelib/lamp/tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import unittest
from unittest import mock

-import pandas as pd
+import fireducks.pandas as pd

from .. import ingest
from .. import constants
Expand Down
2 changes: 1 addition & 1 deletion mbta-performance/chalicelib/parallel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from concurrent.futures import ThreadPoolExecutor, as_completed

-import pandas as pd
+import fireducks.pandas as pd


def make_parallel(single_func, THREAD_COUNT=5):
Expand Down
2 changes: 1 addition & 1 deletion mbta-performance/chalicelib/s3.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import boto3
import io
-import pandas as pd
+import fireducks.pandas as pd
import zlib
import time

Expand Down
1,305 changes: 726 additions & 579 deletions poetry.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ package-mode = false
python = "~3.12"
requests = "^2.32.3"
boto3 = "~1.34.162"
-pandas = "^2.2.2"
-datadog_lambda = "6.98.0"
-pyarrow = "~17.0.0"
+fireducks = "^1.1.0"
+datadog_lambda = "6.104.0"
+pyarrow = "~18.0.0"
mbta-gtfs-sqlite = "^1.1.1"

[tool.poetry.dev-dependencies]
chalice = "^1.31.2"
flake8 = "^7.1.1"
-black = "^24.8.0"
-pytest = "~8.3.2"
+black = "^24.10.0"
+pytest = "~8.3.3"

[tool.black]
line-length = 120
Expand Down
Loading