diff --git a/.github/workflows/cli_test.yml b/.github/workflows/cli_test.yml index 5881664..8bda452 100644 --- a/.github/workflows/cli_test.yml +++ b/.github/workflows/cli_test.yml @@ -44,3 +44,4 @@ jobs: - name: Run inspect run: | parquet-tools inspect ./tests/test1.parquet + parquet-tools inspect --detail ./tests/test1.parquet diff --git a/.vscode/settings.json b/.vscode/settings.json index b2e8583..9bc91e7 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -14,5 +14,6 @@ "--ignore=E402,E501,E722,E252,W504,W605" ], "python.linting.mypyEnabled": true, - "python.linting.mypyPath": ".venv/bin/mypy" + "python.linting.mypyPath": ".venv/bin/mypy", + "editor.defaultFormatter": "ms-python.python" } diff --git a/parquet_tools/commands/inspect.py b/parquet_tools/commands/inspect.py index f7852ce..620e3b0 100644 --- a/parquet_tools/commands/inspect.py +++ b/parquet_tools/commands/inspect.py @@ -1,8 +1,9 @@ import sys from argparse import ArgumentParser, Namespace +from typing import List +import pyarrow.parquet as pq from colorama import Fore, Style - from parquet_tools.parquet.reader import get_filemetadata from .utils import FileNotFoundException, ParquetFile, to_parquet_file @@ -23,6 +24,11 @@ def configure_parser(paser: ArgumentParser) -> ArgumentParser: type=str, required=False, help='awscli profile in ~/.aws/credentials. You use this option when you read parquet file on s3.') + paser.add_argument('--detail', + action='store_true', + required=False, + default=False, + help='Detail expression using thrift.') paser.set_defaults(handler=_cli) return paser @@ -35,14 +41,63 @@ def _cli(args: Namespace) -> None: else: try: with pf.get_local_path() as local_path: - _execute( - filename=local_path, - ) + if args.detail: + _execute_detail( + filename=local_path, + ) + else: + _execute_simple( + filename=local_path, + ) except FileNotFoundException as e: print(str(e), file=sys.stderr) -def _execute(filename: str) -> None: +def _execute_simple(filename: str) -> None: + pq_file: pq.ParquetFile = pq.ParquetFile(filename) + file_meta: pq.FileMetaData = pq_file.metadata + print(_simple_meta_expression(file_meta)) + file_schema: pq.ParquetSchema = pq_file.schema + print(_simple_schema_expression(file_schema)) + + +def _simple_meta_expression(file_meta: pq.FileMetaData) -> str: + return dedent(f''' + ############ file meta data ############ + created_by: {file_meta.created_by} + num_columns: {file_meta.num_columns} + num_rows: {file_meta.num_rows} + num_row_groups: {file_meta.num_row_groups} + format_version: {file_meta.format_version} + serialized_size: {file_meta.serialized_size} + ''') + + +def _simple_schema_expression(schema) -> str: + columns: List[str] = schema.names + columns_exp = '\n'.join(columns) + + exp = dedent(f''' + ############ Columns ############ + {columns_exp} + ''') + for i, column in enumerate(columns): + col = schema.column(i) + exp += dedent(f''' + ############ Column({column}) ############ + name: {col.name} + path: {col.path} + max_definition_level: {col.max_definition_level} + max_repetition_level: {col.max_repetition_level} + physical_type: {col.physical_type} + logical_type: {col.logical_type} + converted_type (legacy): {col.converted_type} + ''') + + return exp + + +def _execute_detail(filename: str) -> None: print(_obj_to_string(get_filemetadata(filename))) diff --git a/pyproject.toml b/pyproject.toml index fec0ba9..ec027a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "parquet_tools" -version = "0.2.5" +version = "0.2.6" description = "Easy install parquet-tools" authors = ["Kentaro Ueda "] license = "MIT" diff --git a/tests/test_inspect.py b/tests/test_inspect.py new file mode 100644 index 0000000..d331dc4 --- /dev/null +++ b/tests/test_inspect.py @@ -0,0 +1,79 @@ +import pytest +from parquet_tools.commands.inspect import _execute_detail, _execute_simple +import pyarrow as pa +import pandas as pd +from tempfile import TemporaryDirectory +import numpy as np + + +@pytest.fixture +def parquet_file(): + df = pd.DataFrame( + {'one': [-1, np.nan, 2.5], + 'two': ['foo', 'bar', 'baz'], + 'three': [True, False, True]} + ) + table = pa.Table.from_pandas(df) + with TemporaryDirectory() as tmp_path: + pq_path = f'{tmp_path}/test.pq' + pa.parquet.write_table(table, pq_path) + yield pq_path + + +def test_excute_detail(parquet_file): + _execute_detail( + parquet_file + ) + # not raise error + + +def test_excute_simple(capfd, parquet_file): + _execute_simple( + parquet_file + ) + out, err = capfd.readouterr() + + assert err == '' + assert out == ''' +############ file meta data ############ +created_by: parquet-cpp version 1.5.1-SNAPSHOT +num_columns: 3 +num_rows: 3 +num_row_groups: 1 +format_version: 1.0 +serialized_size: 2226 + + +############ Columns ############ +one +two +three + +############ Column(one) ############ +name: one +path: one +max_definition_level: 1 +max_repetition_level: 0 +physical_type: DOUBLE +logical_type: None +converted_type (legacy): NONE + +############ Column(two) ############ +name: two +path: two +max_definition_level: 1 +max_repetition_level: 0 +physical_type: BYTE_ARRAY +logical_type: String +converted_type (legacy): UTF8 + +############ Column(three) ############ +name: three +path: three +max_definition_level: 1 +max_repetition_level: 0 +physical_type: BOOLEAN +logical_type: None +converted_type (legacy): NONE + +'''