-
Notifications
You must be signed in to change notification settings - Fork 134
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Html2parquet example #804
base: dev
Are you sure you want to change the base?
Html2parquet example #804
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "c4f9c952-cb3b-40f1-bfb5-00d9a43a5715", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%%capture\n", | ||
"!pip install data-prep-toolkit==0.2.2.dev2\n", | ||
"!pip install 'data-prep-toolkit-transforms[html2parquet]==0.2.2.dev2'\n", | ||
"!pip install pandas" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "20663a67-5aa1-4b61-b989-94201613e41f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from data_processing.runtime.pure_python import PythonTransformLauncher\n", | ||
"from data_processing.utils import ParamsUtils\n", | ||
"\n", | ||
"from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "e75f6922-eb0f-4164-a536-f96393e04604", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import ast\n", | ||
"\n", | ||
"# create parameters\n", | ||
"local_conf = {\n", | ||
" \"input_folder\": \"input\",\n", | ||
" \"output_folder\": \"output\",\n", | ||
"}\n", | ||
"\n", | ||
"params = {\n", | ||
" # Data access. Only required parameters are specified\n", | ||
" \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", | ||
" \"data_files_to_use\": ast.literal_eval(\"['.html']\"),\n", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "data_files_to_use": ast.literal_eval("['.zip', '.html']"), support zip as well |
||
"}\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "4d2354db-1bb3-4a71-98df-f0f148af3a02", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"17:09:40 INFO - html2parquet parameters are : {'output_format': <html2parquet_output_format.MARKDOWN: 'markdown'>, 'favor_precision': <html2parquet_favor_precision.TRUE: 'True'>, 'favor_recall': <html2parquet_favor_recall.TRUE: 'True'>}\n", | ||
"17:09:40 INFO - pipeline id pipeline_id\n", | ||
"17:09:40 INFO - code location None\n", | ||
"17:09:40 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n", | ||
"17:09:40 INFO - data factory data_ max_files -1, n_sample -1\n", | ||
"17:09:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n", | ||
"17:09:40 INFO - orchestrator html2parquet started at 2024-11-13 17:09:40\n", | ||
"17:09:40 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n", | ||
"17:09:47 INFO - Completed 1 files (100.0%) in 0.111 min\n", | ||
"17:09:47 INFO - Done processing 1 files, waiting for flush() completion.\n", | ||
"17:09:47 INFO - done flushing in 0.0 sec\n", | ||
"17:09:47 INFO - Completed execution in 0.111 min, execution result 0\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"\n", | ||
"import sys\n", | ||
"sys.argv = ParamsUtils.dict_to_req(d=(params))\n", | ||
"# create launcher\n", | ||
"launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())\n", | ||
"# launch\n", | ||
"return_code = launcher.launch()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "e2bee8da-c566-4e45-bca1-354dfd04b0df", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>title</th>\n", | ||
" <th>document</th>\n", | ||
" <th>contents</th>\n", | ||
" <th>document_id</th>\n", | ||
" <th>size</th>\n", | ||
" <th>date_acquired</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>ai-alliance-index.html</td>\n", | ||
" <td>ai-alliance-index.html</td>\n", | ||
" <td>![](https://images.prismic.io/ai-alliance/Ztf3...</td>\n", | ||
" <td>f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...</td>\n", | ||
" <td>394</td>\n", | ||
" <td>2024-11-13T17:09:40.947095</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" title document \\\n", | ||
"0 ai-alliance-index.html ai-alliance-index.html \n", | ||
"\n", | ||
" contents \\\n", | ||
"0 ![](https://images.prismic.io/ai-alliance/Ztf3... \n", | ||
"\n", | ||
" document_id size \\\n", | ||
"0 f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121... 394 \n", | ||
"\n", | ||
" date_acquired \n", | ||
"0 2024-11-13T17:09:40.947095 " | ||
] | ||
}, | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"import pyarrow.parquet as pq\n", | ||
"import pandas as pd\n", | ||
"table = pq.read_table('output/ai-alliance-index.parquet')\n", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. table = pq.read_table('/path/to/your/output/folder/sample.parquet') users don't have "ai-alliance-index.parquet" so making it general. |
||
"table.to_pandas()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "cde6e37d-c437-490f-8e01-f4f51a123484", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'![](https://images.prismic.io/ai-alliance/Ztf3gLzzk9ZrW8v8_caliopensourceslide.jpg?auto=format%2Ccompress&fit=max&w=3840)\\n\\n## Open Source AI Demo Night\\n\\nThe AI Alliance, in collaboration with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in San Francisco, bringing together more than 200+ developers and innovators to showcase and celebrate the latest advances in open-source AI.'" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"table.to_pandas()['contents'][0]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2fd0d13b-1ff6-4988-91fb-52c25ba998c8", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "587e43ee-7b51-4a9c-8bf2-0a23e309a7ae", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/path/to/your/input/folder
/path/to/your/output/folder
^to make sure it's the folder not files.