Skip to content

Commit

Permalink
v.0.2.6 (#12)
Browse files Browse the repository at this point in the history
Change the default output of the inspect command, and add a --detail option that prints the thrift-based detailed view.
  • Loading branch information
ktrueda authored Jun 30, 2021
1 parent 321d72e commit dc9df28
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 7 deletions.
1 change: 1 addition & 0 deletions .github/workflows/cli_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,4 @@ jobs:
- name: Run inspect
run: |
parquet-tools inspect ./tests/test1.parquet
parquet-tools inspect --detail ./tests/test1.parquet
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
"--ignore=E402,E501,E722,E252,W504,W605"
],
"python.linting.mypyEnabled": true,
"python.linting.mypyPath": ".venv/bin/mypy"
"python.linting.mypyPath": ".venv/bin/mypy",
"editor.defaultFormatter": "ms-python.python"
}
65 changes: 60 additions & 5 deletions parquet_tools/commands/inspect.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import sys
from argparse import ArgumentParser, Namespace
from typing import List

import pyarrow.parquet as pq
from colorama import Fore, Style

from parquet_tools.parquet.reader import get_filemetadata

from .utils import FileNotFoundException, ParquetFile, to_parquet_file
Expand All @@ -23,6 +24,11 @@ def configure_parser(paser: ArgumentParser) -> ArgumentParser:
type=str,
required=False,
help='awscli profile in ~/.aws/credentials. You use this option when you read parquet file on s3.')
paser.add_argument('--detail',
action='store_true',
required=False,
default=False,
help='Detail expression using thrift.')

paser.set_defaults(handler=_cli)
return paser
Expand All @@ -35,14 +41,63 @@ def _cli(args: Namespace) -> None:
else:
try:
with pf.get_local_path() as local_path:
_execute(
filename=local_path,
)
if args.detail:
_execute_detail(
filename=local_path,
)
else:
_execute_simple(
filename=local_path,
)
except FileNotFoundException as e:
print(str(e), file=sys.stderr)


def _execute(filename: str) -> None:
def _execute_simple(filename: str) -> None:
    """Print the simple (default) inspection report for a parquet file.

    Opens ``filename`` with pyarrow, then prints the file-level metadata
    section followed by the schema/column section.
    """
    parquet: pq.ParquetFile = pq.ParquetFile(filename)
    # Metadata section first, schema section second; each goes through print().
    print(_simple_meta_expression(parquet.metadata))
    print(_simple_schema_expression(parquet.schema))


# Build the "file meta data" section of the simple inspection report.
#
# Reads created_by, num_columns, num_rows, num_row_groups, format_version and
# serialized_size from ``file_meta`` and renders them as one "key: value" line
# each under a banner line. The f-string is passed through ``dedent`` and the
# result is returned to the caller (nothing is printed here).
#
# NOTE(review): ``dedent`` is not among the imports visible in this view --
# presumably textwrap.dedent or a project helper; confirm where it is defined.
def _simple_meta_expression(file_meta: pq.FileMetaData) -> str:
return dedent(f'''
############ file meta data ############
created_by: {file_meta.created_by}
num_columns: {file_meta.num_columns}
num_rows: {file_meta.num_rows}
num_row_groups: {file_meta.num_row_groups}
format_version: {file_meta.format_version}
serialized_size: {file_meta.serialized_size}
''')


# Build the schema part of the simple inspection report.
#
# The result starts with a "Columns" section listing every name in
# ``schema.names`` (one per line), followed by one "Column(<name>)" section per
# entry with the name, path, definition/repetition levels and the physical,
# logical and converted types read from ``schema.column(i)``.
# Returns the concatenated text; nothing is printed here.
#
# NOTE(review): ``schema`` is unannotated; the caller passes ``pq_file.schema``.
# NOTE(review): ``columns_exp`` spans several lines once interpolated. If
# ``dedent`` is textwrap.dedent, verify the common margin is still removed as
# intended for the "Columns" section.
def _simple_schema_expression(schema) -> str:
columns: List[str] = schema.names
# One column name per line for the listing section.
columns_exp = '\n'.join(columns)

exp = dedent(f'''
############ Columns ############
{columns_exp}
''')
# Columns are looked up by position, so this assumes ``schema.names`` and
# ``schema.column(i)`` share the same ordering -- TODO confirm.
for i, column in enumerate(columns):
col = schema.column(i)
exp += dedent(f'''
############ Column({column}) ############
name: {col.name}
path: {col.path}
max_definition_level: {col.max_definition_level}
max_repetition_level: {col.max_repetition_level}
physical_type: {col.physical_type}
logical_type: {col.logical_type}
converted_type (legacy): {col.converted_type}
''')

return exp


def _execute_detail(filename: str) -> None:
    """Print the detailed (``--detail``) view of a parquet file's metadata.

    The metadata is loaded with ``get_filemetadata`` and rendered to text by
    ``_obj_to_string`` before being printed.
    """
    metadata = get_filemetadata(filename)
    rendered = _obj_to_string(metadata)
    print(rendered)


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "parquet_tools"
version = "0.2.5"
version = "0.2.6"
description = "Easy install parquet-tools"
authors = ["Kentaro Ueda <[email protected]>"]
license = "MIT"
Expand Down
79 changes: 79 additions & 0 deletions tests/test_inspect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import pytest
from parquet_tools.commands.inspect import _execute_detail, _execute_simple
import pyarrow as pa
import pandas as pd
from tempfile import TemporaryDirectory
import numpy as np


@pytest.fixture
def parquet_file():
    """Yield the path of a small three-column parquet file.

    The table holds a float column containing a NaN (``one``), a string
    column (``two``) and a boolean column (``three``). The file lives in a
    temporary directory that is removed once the requesting test finishes.
    """
    # Import the submodule explicitly. ``import pyarrow as pa`` alone does not
    # guarantee that ``pa.parquet`` resolves; it previously worked only because
    # another module had already imported ``pyarrow.parquet``.
    import pyarrow.parquet as pq

    df = pd.DataFrame(
        {
            'one': [-1, np.nan, 2.5],
            'two': ['foo', 'bar', 'baz'],
            'three': [True, False, True],
        }
    )
    table = pa.Table.from_pandas(df)
    with TemporaryDirectory() as tmp_path:
        pq_path = f'{tmp_path}/test.pq'
        pq.write_table(table, pq_path)
        # Yield inside the ``with`` so the directory outlives the test body.
        yield pq_path


def test_excute_detail(parquet_file):
    """Smoke test: the detailed printer runs on the fixture file without raising."""
    _execute_detail(parquet_file)


def test_excute_simple(capfd, parquet_file):
    """The simple report for the fixture file matches the expected text exactly."""
    # NOTE(review): created_by and serialized_size presumably depend on the
    # pyarrow version that wrote the fixture file -- confirm when upgrading.
    expected_out = '''
############ file meta data ############
created_by: parquet-cpp version 1.5.1-SNAPSHOT
num_columns: 3
num_rows: 3
num_row_groups: 1
format_version: 1.0
serialized_size: 2226
############ Columns ############
one
two
three
############ Column(one) ############
name: one
path: one
max_definition_level: 1
max_repetition_level: 0
physical_type: DOUBLE
logical_type: None
converted_type (legacy): NONE
############ Column(two) ############
name: two
path: two
max_definition_level: 1
max_repetition_level: 0
physical_type: BYTE_ARRAY
logical_type: String
converted_type (legacy): UTF8
############ Column(three) ############
name: three
path: three
max_definition_level: 1
max_repetition_level: 0
physical_type: BOOLEAN
logical_type: None
converted_type (legacy): NONE
'''
    _execute_simple(parquet_file)
    captured = capfd.readouterr()

    # Nothing may leak to stderr, and stdout must match character for character.
    assert captured.err == ''
    assert captured.out == expected_out

0 comments on commit dc9df28

Please sign in to comment.