Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect content encoding if invalid charset was specified #2549

Merged
merged 6 commits into from
Nov 23, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES/2549.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Make the `aiohttp.ClientResponse.get_encoding` method public and fall back
to content-based encoding detection when the declared charset is invalid.
1 change: 1 addition & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ Ludovic Gasc
Lukasz Marcin Dobrzanski
Makc Belousow
Manuel Miranda
Marat Sharafutdinov
Marco Paolini
Mariano Anaya
Martin Melka
Expand Down
12 changes: 9 additions & 3 deletions aiohttp/client_reqrep.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import codecs
import collections
import io
import json
Expand Down Expand Up @@ -756,11 +757,16 @@ async def read(self):

return self._content

def _get_encoding(self):
def get_encoding(self):
ctype = self.headers.get(hdrs.CONTENT_TYPE, '').lower()
mimetype = helpers.parse_mimetype(ctype)

encoding = mimetype.parameters.get('charset')
if encoding:
try:
codecs.lookup(encoding)
except LookupError:
encoding = None
if not encoding:
if mimetype.type == 'application' and mimetype.subtype == 'json':
# RFC 7159 states that the default encoding is UTF-8.
Expand All @@ -778,7 +784,7 @@ async def text(self, encoding=None, errors='strict'):
await self.read()

if encoding is None:
encoding = self._get_encoding()
encoding = self.get_encoding()

return self._content.decode(encoding, errors=errors)

Expand All @@ -803,7 +809,7 @@ async def json(self, *, encoding=None, loads=json.loads,
return None

if encoding is None:
encoding = self._get_encoding()
encoding = self.get_encoding()

return loads(stripped.decode(encoding))

Expand Down
8 changes: 8 additions & 0 deletions docs/client_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1162,6 +1162,14 @@ Response object
A namedtuple with request URL and headers from :class:`ClientRequest`
object, :class:`aiohttp.RequestInfo` instance.

.. method:: get_encoding()

Automatically detect content encoding using ``charset`` info in
``Content-Type`` HTTP header. If this info does not exist or there
is no appropriate codec for the encoding, then :term:`cchardet` /
:term:`chardet` is used.

.. versionadded:: 3.0


ClientWebSocketResponse
Expand Down
30 changes: 25 additions & 5 deletions tests/test_client_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,12 @@ def side_effect(*args, **kwargs):
'Content-Type': 'application/json'}
content = response.content = mock.Mock()
content.read.side_effect = side_effect
response._get_encoding = mock.Mock()
response.get_encoding = mock.Mock()

res = await response.text(encoding='cp1251')
assert res == '{"тест": "пройден"}'
assert response._connection is None
assert not response._get_encoding.called
assert not response.get_encoding.called


async def test_text_detect_encoding(loop, session):
Expand All @@ -283,6 +283,26 @@ def side_effect(*args, **kwargs):
assert response._connection is None


async def test_text_detect_encoding_if_invalid_charset(loop, session):
    """Check text() on a cp1251 body whose header says ``charset=invalid``.

    The response is read first, then decoded without an explicit encoding,
    and finally ``get_encoding()`` is expected to report windows-1251.
    """
    expected = '{"тест": "пройден"}'
    payload = expected.encode('cp1251')

    def side_effect(*args, **kwargs):
        # Each read call yields an already-resolved future with the body.
        fut = loop.create_future()
        fut.set_result(payload)
        return fut

    response = ClientResponse('get', URL('http://def-cl-resp.org'))
    response._post_init(loop, session)
    response.headers = {'Content-Type': 'text/plain;charset=invalid'}
    response.content = mock.Mock()
    response.content.read.side_effect = side_effect

    await response.read()
    res = await response.text()

    assert res == expected
    assert response._connection is None
    assert response.get_encoding().lower() == 'windows-1251'


async def test_text_after_read(loop, session):
response = ClientResponse('get', URL('http://def-cl-resp.org'))
response._post_init(loop, session)
Expand Down Expand Up @@ -372,12 +392,12 @@ def side_effect(*args, **kwargs):
'Content-Type': 'application/json;charset=utf8'}
content = response.content = mock.Mock()
content.read.side_effect = side_effect
response._get_encoding = mock.Mock()
response.get_encoding = mock.Mock()

res = await response.json(encoding='cp1251')
assert res == {'тест': 'пройден'}
assert response._connection is None
assert not response._get_encoding.called
assert not response.get_encoding.called


@pytest.mark.xfail
Expand All @@ -398,7 +418,7 @@ def test_get_encoding_unknown(loop, session):
response.headers = {'Content-Type': 'application/json'}
with mock.patch('aiohttp.client_reqrep.chardet') as m_chardet:
m_chardet.detect.return_value = {'encoding': None}
assert response._get_encoding() == 'utf-8'
assert response.get_encoding() == 'utf-8'


def test_raise_for_status_2xx():
Expand Down