# Copyright 2021, New York University and the TUF contributors
# SPDX-License-Identifier: MIT OR Apache-2.0
"""Provides an implementation of FetcherInterface using the Requests HTTP
library.
"""
import logging
from typing import Dict, Iterator
from urllib import parse
# Imports
import requests
import requests.exceptions
import tuf
from tuf.api import exceptions
from tuf.ngclient.fetcher import FetcherInterface
# Globals
logger = logging.getLogger(__name__)
# Classes
class RequestsFetcher(FetcherInterface):
"""A concrete implementation of FetcherInterface based on the Requests
library.
Attributes:
_sessions: A dictionary of Requests.Session objects storing a separate
session per scheme+hostname combination.
"""
def __init__(self) -> None:
# http://docs.python-requests.org/en/master/user/advanced/#session-objects:
#
# "The Session object allows you to persist certain parameters across
# requests. It also persists cookies across all requests made from the
# Session instance, and will use urllib3's connection pooling. So if
# you're making several requests to the same host, the underlying TCP
# connection will be reused, which can result in a significant
# performance increase (see HTTP persistent connection)."
#
# NOTE: We use a separate requests.Session per scheme+hostname
# combination, in order to reuse connections to the same hostname to
# improve efficiency, but avoiding sharing state between different
# hosts-scheme combinations to minimize subtle security issues.
# Some cookies may not be HTTP-safe.
self._sessions: Dict[str, requests.Session] = {}
# Default settings
self.socket_timeout: int = 4 # seconds
self.chunk_size: int = 400000 # bytes
    def fetch(self, url: str) -> Iterator[bytes]:
        """Fetches the contents of an HTTP/HTTPS URL from a remote server.

        Arguments:
            url: A URL string that represents a file location.

        Raises:
            exceptions.SlowRetrievalError: A timeout occurs while receiving
                data.
            exceptions.FetcherHTTPError: An HTTP error code is received.
            exceptions.DownloadError: When there is a problem parsing the URL.

        Returns:
            A bytes iterator.
        """
        # Get a customized session for each new scheme+hostname combination.
        session = self._get_session(url)

        # Get the requests.Response object for this URL.
        #
        # Defer downloading the response body with stream=True.
        # Always set the timeout. This timeout value is interpreted by
        # requests as:
        #  - connect timeout (max delay before the first byte is received)
        #  - read (gap) timeout (max delay between bytes received)
        try:
            response = session.get(
                url, stream=True, timeout=self.socket_timeout
            )
        except requests.exceptions.Timeout as e:
            raise exceptions.SlowRetrievalError from e

        # Check response status.
        try:
            response.raise_for_status()
        except requests.HTTPError as e:
            response.close()
            status = e.response.status_code
            raise exceptions.FetcherHTTPError(str(e), status) from e

        return self._chunks(response)

    def _chunks(self, response: "requests.Response") -> Iterator[bytes]:
        """A generator function to be returned by fetch.

        This way the caller of fetch can differentiate between a connection
        error and an error during the actual data download.
        """
        try:
            for data in response.iter_content(self.chunk_size):
                yield data
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        ) as e:
            raise exceptions.SlowRetrievalError from e
        finally:
            response.close()

    def _get_session(self, url: str) -> requests.Session:
        """Returns a different customized requests.Session per scheme+hostname
        combination.

        Raises:
            exceptions.DownloadError: When there is a problem parsing the URL.
        """
        # Use a different requests.Session per scheme+hostname combination, to
        # reuse connections while minimizing subtle security issues.
        parsed_url = parse.urlparse(url)
        if not parsed_url.scheme or not parsed_url.hostname:
            raise exceptions.DownloadError(f"Failed to parse URL {url}")

        session_index = f"{parsed_url.scheme}+{parsed_url.hostname}"
        session = self._sessions.get(session_index)

        if not session:
            session = requests.Session()
            self._sessions[session_index] = session

            ua = f"tuf/{tuf.__version__} {session.headers['User-Agent']}"
            session.headers["User-Agent"] = ua

            logger.debug("Made new session %s", session_index)
        else:
            logger.debug("Reusing session %s", session_index)

        return session
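
# Usage sketch: a minimal, hedged example of driving RequestsFetcher directly,
# assuming this module is run as a script. The URLs below are hypothetical
# placeholders; any HTTP(S) file locations would do. The exception types are
# caught explicitly; in current python-tuf they all derive from
# exceptions.DownloadError.
if __name__ == "__main__":
    fetcher = RequestsFetcher()

    # Both placeholder URLs share a scheme+hostname, so the second fetch()
    # reuses the session created for the first one (see _get_session above).
    urls = [
        "https://example.com/metadata/root.json",
        "https://example.com/metadata/timestamp.json",
    ]

    for url in urls:
        try:
            # fetch() returns a bytes iterator; consuming it performs the
            # actual chunked download.
            data = b"".join(fetcher.fetch(url))
            print(f"Downloaded {len(data)} bytes from {url}")
        except (
            exceptions.SlowRetrievalError,
            exceptions.FetcherHTTPError,
            exceptions.DownloadError,
        ) as e:
            print(f"Download failed for {url}: {type(e).__name__}: {e}")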