Skip to content

Commit

Permalink
[Storage][Blob][QuickQuery]Arrow Format (Azure#13750)
Browse files Browse the repository at this point in the history
* [Storage][Blob][DataLake]Quick Query Arrow Format

* fix pylint

* fix pylint

* fix pylint

* fix pylint
  • Loading branch information
xiafu-msft committed Oct 1, 2020
1 parent 3829bc3 commit b115825
Show file tree
Hide file tree
Showing 10 changed files with 632 additions and 12 deletions.
4 changes: 4 additions & 0 deletions sdk/storage/azure-storage-blob/azure/storage/blob/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
BlobQueryError,
DelimitedJsonDialect,
DelimitedTextDialect,
ArrowDialect,
ArrowType,
ObjectReplicationPolicy,
ObjectReplicationRule
)
Expand Down Expand Up @@ -219,6 +221,8 @@ def download_blob_from_url(
'BlobQueryError',
'DelimitedJsonDialect',
'DelimitedTextDialect',
'ArrowDialect',
'ArrowType',
'BlobQueryReader',
'ObjectReplicationPolicy',
'ObjectReplicationRule'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -682,13 +682,19 @@ def _quick_query_options(self, query_expression,
try:
delimiter = input_format.lineterminator
except AttributeError:
delimiter = input_format.delimiter
try:
delimiter = input_format.delimiter
except AttributeError:
raise ValueError("The Type of blob_format can only be DelimitedTextDialect or DelimitedJsonDialect")
output_format = kwargs.pop('output_format', None)
if output_format:
try:
delimiter = output_format.lineterminator
except AttributeError:
delimiter = output_format.delimiter
try:
delimiter = output_format.delimiter
except AttributeError:
pass
else:
output_format = input_format
query_request = QueryRequest(
Expand Down
26 changes: 25 additions & 1 deletion sdk/storage/azure-storage-blob/azure/storage/blob/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from enum import Enum

from azure.core.paging import PageIterator
from azure.storage.blob._generated.models import FilterBlobItem
from azure.storage.blob._generated.models import FilterBlobItem, ArrowField

from ._shared import decode_base64_to_text
from ._shared.response_handlers import return_context_and_deserialized, process_storage_error
Expand Down Expand Up @@ -1099,6 +1099,30 @@ def __init__(self, **kwargs):
self.has_header = kwargs.pop('has_header', False)


class ArrowDialect(ArrowField):
    """Describe a single field of an Arrow schema.

    All required parameters must be populated in order to send to Azure.

    :param ~azure.storage.blob.ArrowType type: Arrow field type.
    :keyword str name: The name of the field.
    :keyword int precision: The precision of the field.
    :keyword int scale: The scale of the field.
    """

    def __init__(self, type, **kwargs):  # pylint: disable=redefined-builtin
        # ``type`` intentionally shadows the builtin: it is forwarded under the
        # same keyword name to the parent ArrowField initializer, together
        # with any remaining keyword arguments.
        super(ArrowDialect, self).__init__(type=type, **kwargs)


class ArrowType(str, Enum):
    """String-valued identifiers for Arrow field types.

    Each member is also a ``str`` (the class mixes in ``str``), so a member
    compares equal to its plain string value.
    """

    INT64 = "int64"
    BOOL = "bool"
    TIMESTAMP_MS = "timestamp[ms]"
    STRING = "string"
    DOUBLE = "double"
    DECIMAL = "decimal"


class ObjectReplicationPolicy(DictMixin):
"""Policy id and rule ids applied to a blob.
Expand Down
11 changes: 9 additions & 2 deletions sdk/storage/azure-storage-blob/azure/storage/blob/_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@

from ._models import (
ContainerEncryptionScope,
DelimitedJsonDialect
)
DelimitedJsonDialect)
from ._generated.models import (
ModifiedAccessConditions,
SourceModifiedAccessConditions,
Expand All @@ -24,6 +23,7 @@
QuerySerialization,
DelimitedTextConfiguration,
JsonTextConfiguration,
ArrowConfiguration,
QueryFormatType,
BlobTag,
BlobTags, LeaseAccessConditions
Expand Down Expand Up @@ -182,6 +182,13 @@ def serialize_query_format(formater):
type=QueryFormatType.delimited,
delimited_text_configuration=serialization_settings
)
elif isinstance(formater, list):
serialization_settings = ArrowConfiguration(
schema=formater
)
qq_format = QueryFormat(
type=QueryFormatType.arrow,
arrow_configuration=serialization_settings)
elif not formater:
return None
else:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '0'
User-Agent:
- azsdk-python-storage-blob/12.4.0 Python/3.7.3 (Windows-10-10.0.19041-SP0)
x-ms-date:
- Fri, 11 Sep 2020 20:58:27 GMT
x-ms-version:
- '2020-02-10'
method: PUT
uri: https://storagename.blob.core.windows.net/utqqcontainer9d4d1789?restype=container
response:
body:
string: ''
headers:
date:
- Fri, 11 Sep 2020 20:58:28 GMT
etag:
- '"0x8D856956EBF3C36"'
last-modified:
- Fri, 11 Sep 2020 20:58:28 GMT
transfer-encoding:
- chunked
x-ms-version:
- '2020-02-10'
status:
code: 201
message: Created
- request:
body: '100,200,300,400
300,400,500,600
'
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '32'
Content-Type:
- application/octet-stream
User-Agent:
- azsdk-python-storage-blob/12.4.0 Python/3.7.3 (Windows-10-10.0.19041-SP0)
x-ms-blob-type:
- BlockBlob
x-ms-date:
- Fri, 11 Sep 2020 20:58:28 GMT
x-ms-version:
- '2020-02-10'
method: PUT
uri: https://storagename.blob.core.windows.net/utqqcontainer9d4d1789/csvfile9d4d1789
response:
body:
string: ''
headers:
content-md5:
- /hmKXD7m7tyfn12eEsFvyQ==
date:
- Fri, 11 Sep 2020 20:58:28 GMT
etag:
- '"0x8D856956ED0E86F"'
last-modified:
- Fri, 11 Sep 2020 20:58:28 GMT
transfer-encoding:
- chunked
x-ms-content-crc64:
- Dn1U+tgM/4c=
x-ms-request-server-encrypted:
- 'false'
x-ms-version:
- '2020-02-10'
status:
code: 201
message: Created
- request:
body: '<?xml version=''1.0'' encoding=''utf-8''?>
<QueryRequest><QueryType>SQL</QueryType><Expression>SELECT _2 from BlobStorage
WHERE _1 &gt; 250</Expression><OutputSerialization><Format><Type>arrow</Type><ArrowConfiguration><Schema><Field><Type>decimal</Type><Name>abc</Name><Precision>4</Precision><Scale>2</Scale></Field></Schema></ArrowConfiguration></Format></OutputSerialization></QueryRequest>'
headers:
Accept:
- application/xml
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '390'
Content-Type:
- application/xml; charset=utf-8
User-Agent:
- azsdk-python-storage-blob/12.4.0 Python/3.7.3 (Windows-10-10.0.19041-SP0)
x-ms-date:
- Fri, 11 Sep 2020 20:58:28 GMT
x-ms-version:
- '2020-02-10'
method: POST
uri: https://storagename.blob.core.windows.net/utqqcontainer9d4d1789/csvfile9d4d1789?comp=query
response:
body:
string: !!binary |
T2JqAQIWYXZyby5zY2hlbWG+HlsKICB7CiAgICAidHlwZSI6ICJyZWNvcmQiLAogICAgIm5hbWUi
OiAiY29tLm1pY3Jvc29mdC5henVyZS5zdG9yYWdlLnF1ZXJ5QmxvYkNvbnRlbnRzLnJlc3VsdERh
dGEiLAogICAgImRvYyI6ICJIb2xkcyByZXN1bHQgZGF0YSBpbiB0aGUgZm9ybWF0IHNwZWNpZmll
ZCBmb3IgdGhpcyBxdWVyeSAoQ1NWLCBKU09OLCBldGMuKS4iLAogICAgImZpZWxkcyI6IFsKICAg
ICAgewogICAgICAgICJuYW1lIjogImRhdGEiLAogICAgICAgICJ0eXBlIjogImJ5dGVzIgogICAg
ICB9CiAgICBdCiAgfSwKICB7CiAgICAidHlwZSI6ICJyZWNvcmQiLAogICAgIm5hbWUiOiAiY29t
Lm1pY3Jvc29mdC5henVyZS5zdG9yYWdlLnF1ZXJ5QmxvYkNvbnRlbnRzLmVycm9yIiwKICAgICJk
b2MiOiAiQW4gZXJyb3IgdGhhdCBvY2N1cnJlZCB3aGlsZSBwcm9jZXNzaW5nIHRoZSBxdWVyeS4i
LAogICAgImZpZWxkcyI6IFsKICAgICAgewogICAgICAgICJuYW1lIjogImZhdGFsIiwKICAgICAg
ICAidHlwZSI6ICJib29sZWFuIiwKICAgICAgICAiZG9jIjogIklmIHRydWUsIHRoaXMgZXJyb3Ig
cHJldmVudHMgZnVydGhlciBxdWVyeSBwcm9jZXNzaW5nLiAgTW9yZSByZXN1bHQgZGF0YSBtYXkg
YmUgcmV0dXJuZWQsIGJ1dCB0aGVyZSBpcyBubyBndWFyYW50ZWUgdGhhdCBhbGwgb2YgdGhlIG9y
aWdpbmFsIGRhdGEgd2lsbCBiZSBwcm9jZXNzZWQuICBJZiBmYWxzZSwgdGhpcyBlcnJvciBkb2Vz
IG5vdCBwcmV2ZW50IGZ1cnRoZXIgcXVlcnkgcHJvY2Vzc2luZy4iCiAgICAgIH0sCiAgICAgIHsK
ICAgICAgICAibmFtZSI6ICJuYW1lIiwKICAgICAgICAidHlwZSI6ICJzdHJpbmciLAogICAgICAg
ICJkb2MiOiAiVGhlIG5hbWUgb2YgdGhlIGVycm9yIgogICAgICB9LAogICAgICB7CiAgICAgICAg
Im5hbWUiOiAiZGVzY3JpcHRpb24iLAogICAgICAgICJ0eXBlIjogInN0cmluZyIsCiAgICAgICAg
ImRvYyI6ICJBIGRlc2NyaXB0aW9uIG9mIHRoZSBlcnJvciIKICAgICAgfSwKICAgICAgewogICAg
ICAgICJuYW1lIjogInBvc2l0aW9uIiwKICAgICAgICAidHlwZSI6ICJsb25nIiwKICAgICAgICAi
ZG9jIjogIlRoZSBibG9iIG9mZnNldCBhdCB3aGljaCB0aGUgZXJyb3Igb2NjdXJyZWQiCiAgICAg
IH0KICAgIF0KICB9LAogIHsKICAgICJ0eXBlIjogInJlY29yZCIsCiAgICAibmFtZSI6ICJjb20u
bWljcm9zb2Z0LmF6dXJlLnN0b3JhZ2UucXVlcnlCbG9iQ29udGVudHMucHJvZ3Jlc3MiLAogICAg
ImRvYyI6ICJJbmZvcm1hdGlvbiBhYm91dCB0aGUgcHJvZ3Jlc3Mgb2YgdGhlIHF1ZXJ5IiwKICAg
ICJmaWVsZHMiOiBbCiAgICAgIHsKICAgICAgICAibmFtZSI6ICJieXRlc1NjYW5uZWQiLAogICAg
ICAgICJ0eXBlIjogImxvbmciLAogICAgICAgICJkb2MiOiAiVGhlIG51bWJlciBvZiBieXRlcyB0
aGF0IGhhdmUgYmVlbiBzY2FubmVkIgogICAgICB9LAogICAgICB7CiAgICAgICAgIm5hbWUiOiAi
dG90YWxCeXRlcyIsCiAgICAgICAgInR5cGUiOiAibG9uZyIsCiAgICAgICAgImRvYyI6ICJUaGUg
dG90YWwgbnVtYmVyIG9mIGJ5dGVzIHRvIGJlIHNjYW5uZWQgaW4gdGhpcyBxdWVyeSIKICAgICAg
fQogICAgXQogIH0sCiAgewogICAgInR5cGUiOiAicmVjb3JkIiwKICAgICJuYW1lIjogImNvbS5t
aWNyb3NvZnQuYXp1cmUuc3RvcmFnZS5xdWVyeUJsb2JDb250ZW50cy5lbmQiLAogICAgImRvYyI6
ICJTZW50IGFzIHRoZSBmaW5hbCBtZXNzYWdlIG9mIHRoZSByZXNwb25zZSwgaW5kaWNhdGluZyB0
aGF0IGFsbCByZXN1bHRzIGhhdmUgYmVlbiBzZW50LiIsCiAgICAiZmllbGRzIjogWwogICAgICB7
CiAgICAgICAgIm5hbWUiOiAidG90YWxCeXRlcyIsCiAgICAgICAgInR5cGUiOiAibG9uZyIsCiAg
ICAgICAgImRvYyI6ICJUaGUgdG90YWwgbnVtYmVyIG9mIGJ5dGVzIHRvIGJlIHNjYW5uZWQgaW4g
dGhpcyBxdWVyeSIKICAgICAgfQogICAgXQogIH0KXQoAQmgjmNsu90Ck/YQ3d6WMowL2AwDwA///
//94AAAAEAAAAAAACgAMAAYABQAIAAoAAAAAAQMADAAAAAgACAAAAAQACAAAAAQAAAABAAAAFAAA
ABAAFAAIAAYABwAMAAAAEAAQAAAAAAABByQAAAAUAAAABAAAAAAAAAAIAAwABAAIAAgAAAAEAAAA
AgAAAAMAAABhYmMA/////3AAAAAQAAAAAAAKAA4ABgAFAAgACgAAAAADAwAQAAAAAAAKAAwAAAAE
AAgACgAAADAAAAAEAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEA
AAAAAAAAAAAAAAAAAAAAAAAAQmgjmNsu90Ck/YQ3d6WMowLGAgDAAv////+IAAAAFAAAAAAAAAAM
ABYABgAFAAgADAAMAAAAAAMDABgAAAAQAAAAAAAAAAAACgAYAAwABAAIAAoAAAA8AAAAEAAAAAEA
AAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAABAAAAAQAA
AAAAAAAAAAAAAAAAAJABAAAAAAAAAAAAAAAAAABCaCOY2y73QKT9hDd3pYyjAgYEQEBCaCOY2y73
QKT9hDd3pYyjAgQGQEJoI5jbLvdApP2EN3eljKM=
headers:
accept-ranges:
- bytes
content-type:
- avro/binary
date:
- Fri, 11 Sep 2020 20:58:28 GMT
etag:
- '"0x8D856956ED0E86F"'
last-modified:
- Fri, 11 Sep 2020 20:58:28 GMT
transfer-encoding:
- chunked
x-ms-blob-type:
- BlockBlob
x-ms-creation-time:
- Fri, 11 Sep 2020 20:58:28 GMT
x-ms-lease-state:
- available
x-ms-lease-status:
- unlocked
x-ms-version:
- '2020-02-10'
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '0'
User-Agent:
- azsdk-python-storage-blob/12.4.0 Python/3.7.3 (Windows-10-10.0.19041-SP0)
x-ms-date:
- Fri, 11 Sep 2020 20:58:28 GMT
x-ms-version:
- '2020-02-10'
method: DELETE
uri: https://storagename.blob.core.windows.net/utqqcontainer9d4d1789?restype=container
response:
body:
string: ''
headers:
date:
- Fri, 11 Sep 2020 20:58:28 GMT
transfer-encoding:
- chunked
x-ms-version:
- '2020-02-10'
status:
code: 202
message: Accepted
version: 1
Loading

0 comments on commit b115825

Please sign in to comment.