forked from ml6team/fondant
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline.py
79 lines (64 loc) · 2.14 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""Pipeline used to create the dataset to train the StarCoder model."""
import logging
import sys
sys.path.append("../")
from pipeline_configs import PipelineConfigs
from fondant.compiler import DockerCompiler
from fondant.pipeline import ComponentOp, Pipeline
logger = logging.getLogger(__name__)
# Columns of the source dataset that the load component should ingest.
dataset_column_name = [
    "content",
    "lang",
    "size",
    "path",
    "repository_name",
    "avg_line_length",
    "max_line_length",
    "alphanum_fraction",
]
# Rename each ingested column into the "code_" namespace, e.g.
# "content" -> "code_content".
load_component_column_mapping = dict(
    zip(dataset_column_name, ("code_" + name for name in dataset_column_name))
)
# Initialize the pipeline. base_path comes from the shared PipelineConfigs
# module — presumably the location where artifacts are written; confirm in
# pipeline_configs.py.
pipeline = Pipeline(
    pipeline_name="Stack filtering pipeline",
    pipeline_description="A pipeline for filtering the stack dataset",
    base_path=PipelineConfigs.BASE_PATH,
)
# --- component definitions -------------------------------------------------

# Ingest the dataset from the hub, remapping every column into the "code_"
# namespace. n_rows_to_load=None asks the component for all rows.
load_from_hub_op = ComponentOp(
    component_dir="components/load_from_hub",
    arguments={
        "dataset_name": "ml6team/the-stack-smol-python",
        "column_name_mapping": load_component_column_mapping,
        "n_rows_to_load": None,
    },
)

# Registry component filtering on line-length / alphanumeric statistics.
filter_line_length_op = ComponentOp.from_registry(
    name="filter_line_length",
    arguments={
        "avg_line_length_threshold": 10,
        "max_line_length_threshold": 100,
        "alphanum_fraction_threshold": 0.25,
    },
)

# Registry component filtering on the comments ratio of each file.
filter_comments_op = ComponentOp.from_registry(
    name="filter_comments",
    arguments={"min_comments_ratio": 0.1, "max_comments_ratio": 0.9},
)

# Registry component for PII redaction; uses its default arguments.
pii_redaction_op = ComponentOp.from_registry(
    name="pii_redaction",
)

# --- wiring: a linear DAG, each op depending on the previous one -----------
pipeline.add_op(load_from_hub_op)
for op, upstream in (
    (filter_line_length_op, load_from_hub_op),
    (filter_comments_op, filter_line_length_op),
    (pii_redaction_op, filter_comments_op),
):
    pipeline.add_op(op, dependencies=upstream)
if __name__ == "__main__":
compiler = DockerCompiler()
# mount the gcloud credentials to the container
extra_volumes = [
"$HOME/.config/gcloud/application_default_credentials.json:/root/.config/gcloud/application_default_credentials.json:ro"
]
compiler.compile(pipeline=pipeline, extra_volumes=extra_volumes)
logger.info("Run `docker compose up` to run the pipeline.")