Skip to content

Commit

Permalink
Merge branch 'main' into large-scale-controlnet
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilippeMoussalli committed Jul 6, 2023
2 parents 400d1b7 + c180686 commit 858701e
Show file tree
Hide file tree
Showing 47 changed files with 1,129 additions and 311 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ jobs:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Set buildx alias
run: docker buildx install

- name: Login to GitHub Container Registry
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Set buildx alias
run: docker buildx install

- name: Build components
run: ./scripts/build_components.sh --cache -t $GITHUB_SHA -t dev
54 changes: 54 additions & 0 deletions .github/workflows/prep-release.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Pre-release workflow: builds and tags the component and data explorer images
# with the pushed tag, then builds the Python package and uploads it to TestPyPI.
name: Build Python 🐍 distributions 📦 and publish to TestPyPI
on:
  push:
    tags:
      # Tags that start like a version number (digits.digits.digits), optionally
      # followed by a suffix.
      - '[0-9]+.[0-9]+.[0-9]+*'
jobs:
  build-n-publish:
    name: Build Python 🐍 distributions 📦 and publish to TestPyPI
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      packages: write
    steps:
      - uses: actions/checkout@master

      - name: Set up Python 3.9
        uses: actions/setup-python@v1
        with:
          # Quoted so the version is always read as a string, never as a float.
          python-version: '3.9'

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Set buildx alias
        run: docker buildx install

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build components
        run: ./scripts/build_components.sh -t "$GITHUB_REF_NAME"

      - name: Build data explorer
        run: ./scripts/build_explorer.sh -t "$GITHUB_REF_NAME"

      - name: Update version in pyproject.toml with tag version
        # The tag name is read from the runner-provided GITHUB_REF_NAME variable
        # instead of being expanded into the script text with ${{ ... }}, so the
        # tag can never be interpreted as shell code. This also matches the
        # build steps above.
        run: sed -i "s/^version = .*/version = '${GITHUB_REF_NAME}'/" pyproject.toml

      - name: Build a binary wheel and a source tarball
        run: |
          pip install poetry
          ./scripts/pre-build.sh
          poetry build

      - name: Publish distribution 📦 to Test PyPI
        # NOTE(review): the action reference below looks mangled by e-mail
        # obfuscation ("[email protected]"); restore the real
        # `owner/action@ref` value before relying on this workflow.
        uses: pypa/[email protected]
        with:
          name: testpypi
          repository_url: https://test.pypi.org/legacy/
          url: https://test.pypi.org/p/fondant
45 changes: 13 additions & 32 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI
name: Publish Python 🐍 distributions 📦 to PyPI
on:
push:
tags:
- '[0-9]+.[0-9]+.[0-9]+*'
release:
types:
- published
jobs:
build-n-publish:
name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI
name: Publish Python 🐍 distributions 📦 to PyPI
runs-on: ubuntu-latest
permissions:
id-token: write
Expand All @@ -21,49 +18,33 @@ jobs:
with:
python-version: 3.9

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Set buildx alias
run: docker buildx install

- name: Login to GitHub Container Registry
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Build components
if: github.event_name == 'push'
run: ./scripts/build_components.sh -t $GITHUB_REF_NAME

- name: Tag components
if: github.event_name == 'release'
run: ./scripts/tag_components.sh -o $GITHUB_REF_NAME -n latest

- name: Build data explorer
if: github.event_name == 'push'
run: ./scripts/build_explorer.sh -t $GITHUB_REF_NAME

- name: Tag data explorer
if: github.event_name == 'release'
run: ./scripts/tag_explorer.sh -o $GITHUB_REF_NAME -n latest

- name: Update version in pyproject.toml with tag version
run: sed -i "s/^version = .*/version = '${{github.ref_name}}'/" pyproject.toml

- name: Build a binary wheel and a source tarball
- name: Download distributions from test.PyPI
run: |
pip install poetry
./scripts/pre-build.sh
poetry build
- name: Publish distribution 📦 to Test PyPI
if: github.event_name == 'push'
uses: pypa/[email protected]
with:
name: testpypi
repository_url: https://test.pypi.org/legacy/
url: https://test.pypi.org/p/fondant
pip install --upgrade pip
pip download fondant==$GITHUB_REF_NAME -d dist --index-url https://test.pypi.org/simple/ --no-deps --only-binary fondant
pip download fondant==$GITHUB_REF_NAME -d dist --index-url https://test.pypi.org/simple/ --no-deps --no-binary fondant
- name: Publish distribution 📦 to PyPI if triggered by release
if: github.event_name == 'release'
uses: pypa/[email protected]
with:
name: pypi
url: https://pypi.org/p/fondant
url: https://pypi.org/p/fondant
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ And create high quality datasets to fine-tune your own foundation models.

<p align="right">(<a href="#chocolate_bar-fondant">back to top</a>)</p>

## 💨 Getting Started

Eager to get started? Here is a [step-by-step guide](https://fondant.readthedocs.io/en/latest/getting_started) to get your first pipeline up and running.

## 🪄 Example pipelines

Curious to see what Fondant can do? Have a look at our example pipelines:
Expand Down Expand Up @@ -90,6 +94,7 @@ point to create datasets for training code assistants.

<p align="right">(<a href="#chocolate_bar-fondant">back to top</a>)</p>


## 🧩 Reusable components

Fondant comes with a library of reusable components, which can jumpstart your pipeline.
Expand Down
6 changes: 3 additions & 3 deletions components/image_resolution_extraction/fondant_component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ consumes:
produces:
images:
fields:
data:
type: binary
width:
type: int16
height:
type: int16
data:
type: binary
type: int16
5 changes: 3 additions & 2 deletions components/image_resolution_extraction/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
"""
logger.info("Filtering dataset...")

dataframe[[("images", "width"), ("images", "height")]] = \
dataframe[[("images", "data")]].map(extract_dimensions)
dataframe[[("images", "width"), ("images", "height")]] = dataframe[
[("images", "data")]
].apply(lambda x: extract_dimensions(x.images.data), axis=1)

return dataframe

Expand Down
18 changes: 18 additions & 0 deletions components/language_filter/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
# git is required because requirements.txt installs fondant from its Git repository.
# The apt package lists are removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

# Run the component's entry point from the working directory
ENTRYPOINT ["python", "main.py"]
7 changes: 7 additions & 0 deletions components/language_filter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Language filter

## Description
This component is based on the `TransformComponent` and is used to filter a dataframe based on language.
It allows you to remove rows that do not match the provided language, thus providing a way to focus
on specific languages within your data.

14 changes: 14 additions & 0 deletions components/language_filter/fondant_component.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Specification of the language filter component.
name: Filter languages
description: A component that filters text based on the language.
# NOTE(review): the image is called `filter_language` while the component
# directory is `language_filter` - confirm the build publishes under this name.
image: ghcr.io/ml6team/filter_language:latest

# Data read by the component: the `data` field of the `text` subset.
consumes:
  text:
    fields:
      data:
        type: string

# Arguments supplied by the user of the component.
args:
  language:
    description: 'A valid language code or identifier (e.g., "en", "fr", "de").'
    type: str
4 changes: 4 additions & 0 deletions components/language_filter/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
git+https://github.com/ml6team/fondant@main
pyarrow>=7.0
gcsfs==2023.4.00
fasttext-wheel==0.9.2
Binary file added components/language_filter/src/lid.176.ftz
Binary file not shown.
72 changes: 72 additions & 0 deletions components/language_filter/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""A component that filters text based on the language."""
import logging

import fasttext
import pandas as pd

from fondant.component import PandasTransformComponent

logger = logging.getLogger(__name__)


class LanguageIdentification:
    """Language detection for a single target language, backed by a FastText model."""

    # Prefix FastText puts in front of predicted labels (e.g. "__label__en").
    _LABEL_PREFIX = "__label__"

    def __init__(self, language, model_path: str = "lid.176.ftz"):
        """
        Initializes the LanguageIdentification class.

        Args:
            language (str): language to filter on
            model_path (str): The path to the FastText language identification model.
        """
        self.language = language
        self.model = fasttext.load_model(model_path)

    def predict_lang(self, text: str):
        """
        Detects the language of a text sequence.

        Args:
            text (str): The text for language detection.

        Returns:
            str: The predicted language label, exactly as returned by the model.
        """
        # FastText predicts one line at a time and raises a ValueError when the
        # input contains a newline, so multi-line text is flattened first.
        single_line = text.replace("\n", " ")
        predictions = self.model.predict(single_line, k=1)
        return predictions[0][0]

    def is_language(self, row):
        """Predict if text of a row is written in the defined language.

        Args:
            row: Mapping-like row whose "text" entry holds the text to classify.

        Returns:
            bool: True when the predicted label equals the configured language.
        """
        # Compare whole language codes. A substring test against the raw label
        # would let codes such as "la" or "el" match every "__label__xx" value.
        predicted = self._strip_label_prefix(self.predict_lang(row["text"]))
        return predicted == self._strip_label_prefix(self.language)

    @classmethod
    def _strip_label_prefix(cls, label: str) -> str:
        """Return `label` without the leading FastText label prefix, if present."""
        if label.startswith(cls._LABEL_PREFIX):
            return label[len(cls._LABEL_PREFIX):]
        return label


class LanguageFilterComponent(PandasTransformComponent):
    """Transform component that keeps only the rows written in the requested language."""

    def setup(self, *, language):
        """Create the language detector that `transform` relies on.

        Args:
            language: Language a row's text must be predicted as for the row to be kept.
        """
        self.lang_detector = LanguageIdentification(language)

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Drop every row that the detector does not flag as the configured language.

        Args:
            dataframe: Pandas dataframe; each row is passed to
                `LanguageIdentification.is_language`.

        Returns:
            Pandas dataframe restricted to the rows for which the check returned True.
        """
        row_matches = self.lang_detector.is_language
        keep = dataframe.apply(row_matches, axis=1)
        return dataframe[keep]


# Script entry point: build the component from its command-line arguments and run it.
if __name__ == "__main__":
    component = LanguageFilterComponent.from_args()
    component.run()
54 changes: 54 additions & 0 deletions components/language_filter/tests/language_filter_component_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Unit test for language filter component."""
import pandas as pd

from components.language_filter.src.main import LanguageFilterComponent
from fondant.component_spec import ComponentSpec


def test_run_component_test():
    """Filtering a mixed-language dataframe on "de" keeps only the German sentence."""
    # Given: one German, one English and one Dutch sentence
    german_sentence = "Das hier ist ein Satz in deutscher Sprache"
    sentences = [
        german_sentence,
        "This is a sentence in English",
        "Dit is een zin in het Nederlands",
    ]
    dataframe = pd.DataFrame({"text": sentences})

    # When: the language filter component processes the dataframe
    # and removes every entry that is not written in German
    spec = ComponentSpec.from_file("../fondant_component.yaml")
    manifest_path = "./dummy_input_manifest.json"
    component = LanguageFilterComponent(
        spec,
        input_manifest_path=manifest_path,
        output_manifest_path=manifest_path,
        metadata={},
        user_arguments={"language": "de"},
    )
    component.setup(language="de")
    filtered = component.transform(dataframe=dataframe)

    # Then: only the German row is left
    assert len(filtered) == 1
    assert filtered.loc[0]["text"] == german_sentence


def test_run_component_test_filter_out_all():
    """Test that filtering on a language absent from the data removes every row."""
    # Given: Dataframe with text in different languages
    data = [{"text": "Das hier ist ein Satz in deutscher Sprache"},
            {"text": "This is a sentence in English"},
            {"text": "Dit is een zin in het Nederlands"}]
    dataframe = pd.DataFrame(data)

    # When: The language filter component processes the dataframe
    # and filters out all entries which are not written in French
    spec = ComponentSpec.from_file("../fondant_component.yaml")

    component = LanguageFilterComponent(spec, input_manifest_path="./dummy_input_manifest.json",
                                        output_manifest_path="./dummy_input_manifest.json",
                                        metadata={},
                                        user_arguments={"language": "fr"},
                                        )
    # `setup` takes `language` as a required keyword-only argument; calling it
    # without one raises a TypeError before the filter is ever exercised.
    component.setup(language="fr")
    dataframe = component.transform(dataframe=dataframe)

    # Then: dataframe should contain no rows anymore
    assert len(dataframe) == 0
18 changes: 18 additions & 0 deletions components/text_normalization/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
# NOTE(review): git is presumably needed for a Git-based entry in requirements.txt - confirm.
# The apt package lists are removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

# Run the component's entry point from the working directory
ENTRYPOINT ["python", "main.py"]
Loading

0 comments on commit 858701e

Please sign in to comment.