Use new data format #667

Merged
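This PR moves every component from the old nested subset/field schema to a flat `<subset>_<field>` naming, both in the `fondant_component.yaml` specs and in the component code. A minimal sketch of the dataframe-level difference (illustrative data only; the column names mirror the caption_images component):

```python
import pandas as pd

# Old format: each field lived under a subset, giving multi-index
# ("subset", "field") columns on the component dataframe.
old_df = pd.DataFrame({("images", "data"): [b"\x89PNG..."]})
image_bytes = old_df["images"]["data"]  # two-level lookup

# New format: one flat "<subset>_<field>" column per field.
new_df = pd.DataFrame({"images_data": [b"\x89PNG..."]})
image_bytes = new_df["images_data"]  # single flat lookup
```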
12 changes: 4 additions & 8 deletions components/caption_images/fondant_component.yaml
@@ -5,16 +5,12 @@ tags:
   - Image processing
 
 consumes:
-  images:
-    fields:
-      data:
-        type: binary
+  images_data:
+    type: binary
 
 produces:
-  captions:
-    fields:
-      text:
-        type: utf8
+  captions_text:
+    type: utf8
 
 args:
   model_id:
4 changes: 2 additions & 2 deletions components/caption_images/src/main.py
@@ -90,7 +90,7 @@ def __init__(
         self.max_new_tokens = max_new_tokens
 
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
-        images = dataframe["images"]["data"]
+        images = dataframe["images_data"]
 
         results: t.List[pd.Series] = []
         for batch in np.split(
@@ -112,4 +112,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         ).T
         results.append(captions)
 
-        return pd.concat(results).to_frame(name=("captions", "text"))
+        return pd.concat(results).to_frame(name=("captions_text"))
16 changes: 6 additions & 10 deletions components/chunk_text/fondant_component.yaml
@@ -10,18 +10,14 @@ tags:
   - Text processing
 
 consumes:
-  text:
-    fields:
-      data:
-        type: string
+  text_data:
+    type: string
 
 produces:
-  text:
-    fields:
-      data:
-        type: string
-      original_document_id:
-        type: string
+  text_data:
+    type: string
+  text_original_document_id:
+    type: string
 
 args:
   chunk_size:
7 changes: 1 addition & 6 deletions components/chunk_text/src/main.py
@@ -38,7 +38,7 @@ def __init__(
     def chunk_text(self, row) -> t.List[t.Tuple]:
         # Multi-index df has id under the name attribute
         doc_id = row.name
-        text_data = row[("text", "data")]
+        text_data = row[("text_data")]
         docs = self.text_splitter.create_documents([text_data])
         return [
             (doc_id, f"{doc_id}_{chunk_id}", chunk.page_content)
@@ -63,9 +63,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         )
         results_df = results_df.set_index("id")
 
-        # Set multi-index column for the expected subset and field
-        results_df.columns = pd.MultiIndex.from_product(
-            [["text"], results_df.columns],
-        )
-
         return results_df
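The deleted block above lifted the chunker's flat output columns into the old `(subset, field)` scheme; with the flat naming the transform can return plainly named columns instead. A rough sketch of the before/after idea (hypothetical frame and column names, not the component's literal code):

```python
import pandas as pd

results_df = pd.DataFrame(
    {"original_document_id": ["a"], "data": ["chunked text"]},
    index=pd.Index(["a_0"], name="id"),
)

# Old behaviour: wrap every column under the "text" subset, yielding
# ("text", "original_document_id") and ("text", "data") multi-index columns.
old_style = results_df.copy()
old_style.columns = pd.MultiIndex.from_product([["text"], old_style.columns])

# New behaviour: use flat "<subset>_<field>" names directly and return as-is.
new_style = results_df.rename(
    columns={
        "original_document_id": "text_original_document_id",
        "data": "text_data",
    },
)
```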
8 changes: 4 additions & 4 deletions components/chunk_text/tests/chunk_text_test.py
@@ -3,11 +3,11 @@
 from src.main import ChunkTextComponent
 
 
-def test_transform():
+def test_transform__():
     """Test chunk component method."""
     input_dataframe = pd.DataFrame(
         {
-            ("text", "data"): [
+            ("text_data"): [
                 "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo",
                 "ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis",
                 "parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec,",
@@ -25,8 +25,8 @@ def test_transform():
 
     expected_output_dataframe = pd.DataFrame(
         {
-            ("text", "original_document_id"): ["a", "a", "a", "b", "b", "c", "c"],
-            ("text", "data"): [
+            ("text_original_document_id"): ["a", "a", "a", "b", "b", "c", "c"],
+            ("text_data"): [
                 "Lorem ipsum dolor sit amet, consectetuer",
                 "amet, consectetuer adipiscing elit. Aenean",
                 "elit. Aenean commodo",
24 changes: 10 additions & 14 deletions components/download_images/fondant_component.yaml
@@ -13,21 +13,17 @@ tags:
   - Image processing
 
 consumes:
-  images:
-    fields:
-      url:
-        type: string
+  images_url:
+    type: string
 
 produces:
-  images:
-    fields:
-      data:
-        type: binary
-      width:
-        type: int32
-      height:
-        type: int32
-    additionalFields: false
+  images_data:
+    type: binary
+  images_width:
+    type: int32
+  images_height:
+    type: int32
+  # additionalFields: false
 
 args:
   timeout:
@@ -53,7 +49,7 @@ args:
     description: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border".
     type: str
    default: 'border'
-  resize_only_if_bigger:
+  resize_only_if_bigger:
     description: If True, resize only if image is bigger than image_size.
     type: bool
     default: False
5 changes: 1 addition & 4 deletions components/download_images/src/main.py
@@ -119,7 +119,7 @@ async def download_dataframe() -> None:
             images = await asyncio.gather(
                 *[
                     self.download_and_resize_image(id_, url, semaphore=semaphore)
-                    for id_, url in zip(dataframe.index, dataframe["images"]["url"])
+                    for id_, url in zip(dataframe.index, dataframe["images_url"])
                 ],
             )
             results.extend(images)
@@ -134,8 +134,5 @@ async def download_dataframe() -> None:
 
         results_df = results_df.dropna()
         results_df = results_df.set_index("id", drop=True)
-        results_df.columns = pd.MultiIndex.from_product(
-            [["images"], results_df.columns],
-        )
 
         return results_df
8 changes: 4 additions & 4 deletions components/download_images/tests/test_component.py
@@ -45,7 +45,7 @@ def test_transform(respx_mock):
 
     input_dataframe = pd.DataFrame(
         {
-            ("images", "url"): urls,
+            "images_url": urls,
         },
         index=pd.Index(ids, name="id"),
     )
@@ -55,9 +55,9 @@
     resized_images = [component.resizer(io.BytesIO(image))[0] for image in images]
     expected_dataframe = pd.DataFrame(
         {
-            ("images", "data"): resized_images,
-            ("images", "width"): [image_size] * len(ids),
-            ("images", "height"): [image_size] * len(ids),
+            "images_data": resized_images,
+            "images_width": [image_size] * len(ids),
+            "images_height": [image_size] * len(ids),
         },
         index=pd.Index(ids, name="id"),
     )
18 changes: 7 additions & 11 deletions components/embed_images/fondant_component.yaml
@@ -2,21 +2,17 @@ name: Embed images
 description: Component that generates CLIP embeddings from images
 image: fndnt/embed_images:dev
 tags:
-- Image processing
+  - Image processing
 
 consumes:
-  images:
-    fields:
-      data:
-        type: binary
+  images_data:
+    type: binary
 
 produces:
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
+  embeddings_data:
+    type: array
+    items:
+      type: float32
 
 args:
   model_id:
4 changes: 2 additions & 2 deletions components/embed_images/src/main.py
@@ -90,7 +90,7 @@ def __init__(
         self.batch_size = batch_size
 
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
-        images = dataframe["images"]["data"]
+        images = dataframe["images_data"]
 
         results: t.List[pd.Series] = []
         for batch in np.split(
@@ -110,4 +110,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         ).T
         results.append(embeddings)
 
-        return pd.concat(results).to_frame(name=("embeddings", "data"))
+        return pd.concat(results).to_frame(name=("embeddings_data"))
26 changes: 11 additions & 15 deletions components/embed_text/fondant_component.yaml
@@ -5,21 +5,17 @@ tags:
   - Text processing
 
 consumes:
-  text:
-    fields:
-      data:
-        type: string
+  text_data:
+    type: string
 
 produces:
-  text:
-    fields:
-      data:
-        type: string
-      embedding:
-        type: array
-        items:
-          type: float32
-
+  text_data:
+    type: string
+  text_embedding:
+    type: array
+    items:
+      type: float32
+
 args:
   model_provider:
     description: |
@@ -40,12 +36,12 @@ args:
       Pass only the keys required by the model provider or conveniently pass all keys you will ever need.
       Pay attention how to name the dictionary keys so that they can be used by the model provider.
     type: dict
-    default: {}
+    default: { }
   auth_kwargs:
     description: |
      Additional keyword arguments required for api initialization/authentication.
    type: dict
-    default: {}
+    default: { }
 
 
 
4 changes: 2 additions & 2 deletions components/embed_text/src/main.py
@@ -65,7 +65,7 @@ def get_embeddings_vectors(self, texts):
         return self.embedding_model.embed_documents(texts.tolist())
 
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
-        dataframe[("text", "embedding")] = self.get_embeddings_vectors(
-            dataframe[("text", "data")],
+        dataframe["text_embedding"] = self.get_embeddings_vectors(
+            dataframe["text_data"],
         )
         return dataframe
16 changes: 6 additions & 10 deletions components/embedding_based_laion_retrieval/fondant_component.yaml
@@ -7,18 +7,14 @@ tags:
   - Data retrieval
 
 consumes:
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
+  embeddings_data:
+    type: array
+    items:
+      type: float32
 
 produces:
-  images:
-    fields:
-      url:
-        type: string
+  images_url:
+    type: string
   additionalSubsets: false
 
 args:
6 changes: 3 additions & 3 deletions components/embedding_based_laion_retrieval/src/main.py
@@ -58,18 +58,18 @@ async def async_query():
                             embedding_input=embedding.tolist(),
                         ),
                     )
-                    for embedding in dataframe["embeddings"]["data"]
+                    for embedding in dataframe["embeddings_data"]
                 ]
                 for response in await asyncio.gather(*futures):
                     results.extend(response)
 
         loop.run_until_complete(async_query())
 
-        results_df = pd.DataFrame(results)[["id", "url"]]
+        results_df = pd.DataFrame(results)["id", "url"]
         results_df = results_df.set_index("id")
 
         # Cast the index to string
         results_df.index = results_df.index.astype(str)
-        results_df.columns = [["images"], ["url"]]
+        results_df.columns = ["images_url"]
 
         return results_df
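The same flattening shows up where a component assembles its own results frame: rather than assigning a two-level `[["images"], ["url"]]` column header, the output keeps a single flat `images_url` column. A standalone sketch with made-up retrieval records (not the component's actual response format):

```python
import pandas as pd

# Hypothetical LAION-style retrieval results.
results = [
    {"id": "0", "url": "https://example.com/a.jpg", "similarity": 0.91},
    {"id": "1", "url": "https://example.com/b.jpg", "similarity": 0.87},
]

results_df = pd.DataFrame(results)[["id", "url"]]  # keep only the columns we produce
results_df = results_df.set_index("id")
results_df.index = results_df.index.astype(str)    # cast the index to string
results_df.columns = ["images_url"]                # flat name for the url field
```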
10 changes: 4 additions & 6 deletions components/filter_image_resolution/fondant_component.yaml
@@ -5,12 +5,10 @@ tags:
   - Image processing
 
 consumes:
-  images:
-    fields:
-      width:
-        type: int32
-      height:
-        type: int32
+  images_width:
+    type: int32
+  images_height:
+    type: int32
 
 args:
   min_image_dim:
4 changes: 2 additions & 2 deletions components/filter_image_resolution/src/main.py
@@ -23,8 +23,8 @@ def __init__(self, *_, min_image_dim: int, max_aspect_ratio: float) -> None:
         self.max_aspect_ratio = max_aspect_ratio
 
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
-        width = dataframe["images"]["width"]
-        height = dataframe["images"]["height"]
+        width = dataframe["images_width"]
+        height = dataframe["images_height"]
         min_image_dim = np.minimum(width, height)
         max_image_dim = np.maximum(width, height)
         aspect_ratio = max_image_dim / min_image_dim
6 changes: 2 additions & 4 deletions components/filter_text_length/fondant_component.yaml
@@ -5,10 +5,8 @@ tags:
   - Text processing
 
 consumes:
-  text:
-    fields:
-      data:
-        type: string
+  text_data:
+    type: string
 
 args:
   min_characters_length:
4 changes: 2 additions & 2 deletions components/filter_text_length/src/main.py
@@ -23,10 +23,10 @@ def __init__(self, *_, min_characters_length: int, min_words_length: int):
 
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         """Filter out text based on their length."""
-        caption_num_words = dataframe["text"]["data"].apply(
+        caption_num_words = dataframe["text_data"].apply(
             lambda x: len(fasttext.tokenize(x)),
         )
-        caption_num_chars = dataframe["text"]["data"].apply(len)
+        caption_num_chars = dataframe["text_data"].apply(len)
 
         mask = (caption_num_words >= self.min_words_length) & (
             caption_num_chars >= self.min_characters_length
@@ -24,6 +24,6 @@ def test_run_component_test():
     # Then: dataframe only contains one row
     assert len(dataframe) == 1
     assert (
-        dataframe.loc[2]["text"]["data"]
+        dataframe.loc[2]["text_data"]
         == "This a valid sentence which should be still there"
     )