Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[core] remove chardet dep from azure.core.rest #19962

Merged
merged 8 commits into from
Aug 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion sdk/core/azure-core/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@

- Cut hard dependency on requests library

### Breaking Changes
### Breaking Changes in the Provisional `azure.core.rest` package

- `azure.core.rest` will not try to guess the `charset` anymore if it was impossible to extract it from `HttpResponse` analysis. This removes our dependency on `charset`.

### Key Bugs Fixed

Expand Down
24 changes: 9 additions & 15 deletions sdk/core/azure-core/azure/core/rest/_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,6 @@
from urlparse import urlparse # type: ignore
except ImportError:
from urllib.parse import urlparse
try:
import cchardet as chardet
except ImportError: # pragma: no cover
import chardet # type: ignore
from ..exceptions import ResponseNotReadError

################################### TYPES SECTION #########################

Expand Down Expand Up @@ -285,22 +280,21 @@ def from_pipeline_transport_request_helper(request_class, pipeline_transport_req
)

def get_charset_encoding(response):
# type: (...) -> Optional[str]
content_type = response.headers.get("Content-Type")

if not content_type:
return None
_, params = cgi.parse_header(content_type)
encoding = params.get('charset') # -> utf-8
if encoding is None:
if content_type in ("application/json", "application/rdap+json"):
# RFC 7159 states that the default encoding is UTF-8.
# RFC 7483 defines application/rdap+json
encoding = "utf-8"
else:
try:
encoding = chardet.detect(response.content)["encoding"]
except ResponseNotReadError:
pass
if encoding is None or not lookup_encoding(encoding):
return None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We want to default to "utf-8"("utf-8-sig"), right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After talking with @lmazuel, we don't default the value of self.encoding to utf-8-sig. We do that when we're deserializing for the text property, which I've added code for in def decode_to_text

return encoding

def decode_to_text(encoding, content):
# type: (Optional[str], bytes) -> str
if encoding == "utf-8":
encoding = "utf-8-sig"
if encoding:
return content.decode(encoding)
return codecs.getincrementaldecoder("utf-8-sig")(errors="replace").decode(content)
24 changes: 17 additions & 7 deletions sdk/core/azure-core/azure/core/rest/_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
to_pipeline_transport_request_helper,
from_pipeline_transport_request_helper,
get_charset_encoding,
decode_to_text,
)
from ..exceptions import ResponseNotReadError
if TYPE_CHECKING:
Expand Down Expand Up @@ -205,6 +206,7 @@ def __init__(self, **kwargs):
self._json = None # this is filled in ContentDecodePolicy, when we deserialize
self._connection_data_block_size = None # type: Optional[int]
self._content = None # type: Optional[bytes]
self._text = None # type: Optional[str]

@property
def url(self):
Expand All @@ -215,13 +217,18 @@ def url(self):
@property
def encoding(self):
# type: (...) -> Optional[str]
"""Returns the response encoding. By default, is specified
by the response Content-Type header.
"""Returns the response encoding.

:return: The response encoding. We either return the encoding set by the user,
or try extracting the encoding from the response's content type. If all fails,
we return `None`.
:rtype: optional[str]
"""
try:
return self._encoding
except AttributeError:
return get_charset_encoding(self)
self._encoding = get_charset_encoding(self) # type: Optional[str]
return self._encoding

@encoding.setter
def encoding(self, value):
Expand All @@ -233,10 +240,13 @@ def encoding(self, value):
def text(self):
# type: (...) -> str
"""Returns the response body as a string"""
encoding = self.encoding
if encoding == "utf-8" or encoding is None:
encoding = "utf-8-sig"
return self.content.decode(encoding)
if self._text is None:
content = self.content
if not content:
self._text = ""
else:
self._text = decode_to_text(self.encoding, self.content)
return self._text

def json(self):
# type: (...) -> Any
Expand Down
26 changes: 18 additions & 8 deletions sdk/core/azure-core/azure/core/rest/_rest_py3.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@
format_parameters,
to_pipeline_transport_request_helper,
from_pipeline_transport_request_helper,
get_charset_encoding
get_charset_encoding,
decode_to_text,
)
from ._helpers_py3 import set_content_body
from ..exceptions import ResponseNotReadError
Expand Down Expand Up @@ -235,6 +236,7 @@ def __init__(
self._connection_data_block_size = None
self._json = None # this is filled in ContentDecodePolicy, when we deserialize
self._content = None # type: Optional[bytes]
self._text = None # type: Optional[str]

@property
def url(self) -> str:
Expand All @@ -243,13 +245,18 @@ def url(self) -> str:

@property
def encoding(self) -> Optional[str]:
"""Returns the response encoding. By default, is specified
by the response Content-Type header.
"""Returns the response encoding.

:return: The response encoding. We either return the encoding set by the user,
or try extracting the encoding from the response's content type. If all fails,
we return `None`.
:rtype: optional[str]
"""
try:
return self._encoding
except AttributeError:
return get_charset_encoding(self)
self._encoding: Optional[str] = get_charset_encoding(self)
return self._encoding

@encoding.setter
def encoding(self, value: str) -> None:
Expand All @@ -259,10 +266,13 @@ def encoding(self, value: str) -> None:
@property
def text(self) -> str:
"""Returns the response body as a string"""
encoding = self.encoding
if encoding == "utf-8" or encoding is None:
encoding = "utf-8-sig"
return self.content.decode(encoding)
if self._text is None:
content = self.content
if not content:
self._text = ""
else:
self._text = decode_to_text(self.encoding, self.content)
return self._text

def json(self) -> Any:
"""Returns the whole body as a json object.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ async def test_response_no_charset_with_ascii_content(send_request):

assert response.headers["Content-Type"] == "text/plain"
assert response.status_code == 200
assert response.encoding == 'ascii'
assert response.encoding is None
content = await response.read()
assert content == b"Hello, world!"
assert response.text == "Hello, world!"
Expand All @@ -165,8 +165,8 @@ async def test_response_no_charset_with_iso_8859_1_content(send_request):
request=HttpRequest("GET", "/encoding/iso-8859-1"),
)
await response.read()
assert response.text == u"Accented: Österreich"
assert response.encoding == 'ISO-8859-1'
assert response.text == "Accented: �sterreich" # aiohttp is having diff behavior than requests
assert response.encoding is None

# NOTE: aiohttp isn't liking this
# @pytest.mark.asyncio
Expand All @@ -187,7 +187,7 @@ async def test_json(send_request):
)
await response.read()
assert response.json() == {"greeting": "hello", "recipient": "world"}
assert response.encoding == 'utf-8'
assert response.encoding is None

@pytest.mark.asyncio
async def test_json_with_specified_encoding(send_request):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def test_response_no_charset_with_ascii_content(send_request):

assert response.headers["Content-Type"] == "text/plain"
assert response.status_code == 200
assert response.encoding == 'ascii'
assert response.encoding is None
assert response.text == "Hello, world!"


Expand All @@ -151,7 +151,7 @@ def test_response_no_charset_with_iso_8859_1_content(send_request):
request=HttpRequest("GET", "/encoding/iso-8859-1"),
)
assert response.text == u"Accented: Österreich"
assert response.encoding == 'ISO-8859-1'
assert response.encoding is None

def test_response_set_explicit_encoding(send_request):
# Deliberately incorrect charset
Expand All @@ -168,7 +168,7 @@ def test_json(send_request):
request=HttpRequest("GET", "/basic/json"),
)
assert response.json() == {"greeting": "hello", "recipient": "world"}
assert response.encoding == 'utf-8-sig' # for requests, we use utf-8-sig instead of utf-8 bc of requests behavior
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need these three tests? Maybe one is good enought?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not quite sure which tests you mean by the three tests, but I feel like more tests is better, so might as well not get rid of them

assert response.encoding is None

def test_json_with_specified_encoding(send_request):
response = send_request(
Expand Down