Skip to content

Commit

Permalink
add dataframe example (NVIDIA#137)
Browse files Browse the repository at this point in the history
Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick committed Jul 23, 2024
1 parent 7bd69cb commit 4fec0f6
Showing 1 changed file with 79 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,13 @@
"metadata": {},
"outputs": [],
"source": [
<<<<<<< HEAD
"from nemo_curator import DomainClassifier, QualityClassifier, get_client\n",
=======
"from dask_cuda import LocalCUDACluster\n",
"from dask.distributed import Client\n",
"from nemo_curator import DomainClassifier, QualityClassifier\n",
>>>>>>> 19692e0 (add dataframe example (#137))
"from nemo_curator.datasets import DocumentDataset\n",
"import cudf\n",
"import dask_cudf"
Expand Down Expand Up @@ -124,9 +130,50 @@
"outputs": [],
"source": [
"if classifier_type == \"DomainClassifier\":\n",
<<<<<<< HEAD
" classifier = DomainClassifier(batch_size=1024)\n",
"\n",
"elif classifier_type == \"QualityClassifier\":\n",
=======
" domain_labels = [\n",
" \"Adult\",\n",
" \"Arts_and_Entertainment\",\n",
" \"Autos_and_Vehicles\",\n",
" \"Beauty_and_Fitness\",\n",
" \"Books_and_Literature\",\n",
" \"Business_and_Industrial\",\n",
" \"Computers_and_Electronics\",\n",
" \"Finance\",\n",
" \"Food_and_Drink\",\n",
" \"Games\",\n",
" \"Health\",\n",
" \"Hobbies_and_Leisure\",\n",
" \"Home_and_Garden\",\n",
" \"Internet_and_Telecom\",\n",
" \"Jobs_and_Education\",\n",
" \"Law_and_Government\",\n",
" \"News\",\n",
" \"Online_Communities\",\n",
" \"People_and_Society\",\n",
" \"Pets_and_Animals\",\n",
" \"Real_Estate\",\n",
" \"Science\",\n",
" \"Sensitive_Subjects\",\n",
" \"Shopping\",\n",
" \"Sports\",\n",
" \"Travel_and_Transportation\",\n",
" ]\n",
"\n",
" classifier = DomainClassifier(\n",
" model_path=domain_model_path,\n",
" labels=domain_labels,\n",
" batch_size=1024,\n",
" )\n",
"\n",
"elif classifier_type == \"QualityClassifier\":\n",
" quality_labels = [\"High\", \"Medium\", \"Low\"]\n",
"\n",
>>>>>>> 19692e0 (add dataframe example (#137))
" classifier = QualityClassifier(\n",
" model_path=quality_model_path,\n",
" batch_size=1024,\n",
Expand Down Expand Up @@ -161,23 +208,36 @@
"name": "stderr",
"output_type": "stream",
"text": [
<<<<<<< HEAD
"GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.23it/s]"
=======
"GPU: 0, Part: 0: 100%|██████████| 10/10 [00:02<00:00, 3.62it/s]"
>>>>>>> 19692e0 (add dataframe example (#137))
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing to disk complete for 1 partitions\n",
<<<<<<< HEAD
"CPU times: user 4.69 s, sys: 5.13 s, total: 9.82 s\n",
"Wall time: 12.7 s\n"
=======
"CPU times: user 578 ms, sys: 429 ms, total: 1.01 s\n",
"Wall time: 9.91 s\n"
>>>>>>> 19692e0 (add dataframe example (#137))
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
<<<<<<< HEAD
"GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.07it/s]\n"
=======
"GPU: 0, Part: 0: 100%|██████████| 10/10 [00:03<00:00, 3.30it/s]\n"
>>>>>>> 19692e0 (add dataframe example (#137))
]
}
],
Expand Down Expand Up @@ -286,6 +346,25 @@
"source": [
"output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
"output_dataset.df.head()"
<<<<<<< HEAD
=======
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Remove the Output File(s)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"!rm -rf $output_file_path"
>>>>>>> 19692e0 (add dataframe example (#137))
]
}
],
Expand Down

0 comments on commit 4fec0f6

Please sign in to comment.