Fix retry logic for download of files

This fixes #235. The retry logic has been broken with commit
4bc0a94e12 in version 4.2.1.
This commit is contained in:
Alexander Graf 2019-02-10 20:16:10 +01:00
parent a0b7804fd2
commit a9e79e90f9
2 changed files with 57 additions and 33 deletions

View File

@ -15,6 +15,9 @@ from hashlib import md5
from io import BytesIO
from typing import Any, Callable, Iterator, List, Optional, Set, Union
import requests
import urllib3
from .exceptions import *
from .instaloadercontext import InstaloaderContext
from .structures import Highlight, JsonExportable, Post, PostLocation, Profile, Story, StoryItem, save_structure_to_file, load_structure_from_file
@ -47,6 +50,37 @@ def _requires_login(func: Callable) -> Callable:
return call
def _retry_on_connection_error(func: Callable) -> Callable:
"""Decorator to retry the function max_connection_attemps number of times.
Herewith-decorated functions need an ``_attempt`` keyword argument.
This is to decorate functions that do network requests that may fail. Note that
:meth:`.get_json`, :meth:`.get_iphone_json`, :meth:`.graphql_query` and :meth:`.graphql_node_list` already have
their own logic for retrying, hence functions that only use these for network access must not be decorated with this
decorator."""
@wraps(func)
def call(instaloader, *args, **kwargs):
try:
return func(instaloader, *args, **kwargs)
except (urllib3.exceptions.HTTPError, requests.exceptions.RequestException, ConnectionException) as err:
error_string = "{}({}): {}".format(func.__name__, ', '.join([repr(arg) for arg in args]), err)
if (kwargs.get('_attempt') or 1) == instaloader.context.max_connection_attempts:
raise ConnectionException(error_string) from None
instaloader.context.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False)
try:
if kwargs.get('_attempt'):
kwargs['_attempt'] += 1
else:
kwargs['_attempt'] = 2
instaloader.context.do_sleep()
return call(instaloader, *args, **kwargs)
except KeyboardInterrupt:
instaloader.context.error("[skipped by user]", repeat_at_end=False)
raise ConnectionException(error_string) from None
return call
class _ArbitraryItemFormatter(string.Formatter):
def __init__(self, item: Any):
self._item = item
@ -175,7 +209,9 @@ class Instaloader:
def __exit__(self, *args):
self.close()
def download_pic(self, filename: str, url: str, mtime: datetime, filename_suffix: Optional[str] = None) -> bool:
@_retry_on_connection_error
def download_pic(self, filename: str, url: str, mtime: datetime,
filename_suffix: Optional[str] = None, _attempt: int = 1) -> bool:
"""Downloads and saves picture with given url under given directory with given timestamp.
Returns true, if file was actually downloaded, i.e. updated."""
urlmatch = re.search('\\.[a-z0-9]*\\?', url)
@ -285,7 +321,8 @@ class Instaloader:
os.utime(filename, (datetime.now().timestamp(), mtime.timestamp()))
self.context.log('geo', end=' ', flush=True)
def download_profilepic(self, profile: Profile) -> None:
@_retry_on_connection_error
def download_profilepic(self, profile: Profile, _attempt: int = 1) -> None:
"""Downloads and saves profile pic."""
def _epoch_to_string(epoch: datetime) -> str:

View File

@ -14,7 +14,6 @@ from typing import Any, Callable, Dict, Iterator, Optional, Union
import requests
import requests.utils
import urllib3
from .exceptions import *
@ -195,7 +194,7 @@ class InstaloaderContext:
csrf_token = session.cookies.get_dict()['csrftoken']
session.headers.update({'X-CSRFToken': csrf_token})
# Not using self.get_json() here, because we need to access csrftoken cookie
self._sleep()
self.do_sleep()
login = session.post('https://www.instagram.com/accounts/login/ajax/',
data={'password': passwd, 'username': user}, allow_redirects=True)
try:
@ -261,7 +260,7 @@ class InstaloaderContext:
self.username = user
self.two_factor_auth_pending = None
def _sleep(self):
def do_sleep(self):
"""Sleep a short time if self.sleep is set. Called before each request to instagram.com."""
if self.sleep:
time.sleep(min(random.expovariate(0.7), 5.0))
@ -304,7 +303,7 @@ class InstaloaderContext:
self.query_timestamps = [time.monotonic()]
sess = session if session else self._session
try:
self._sleep()
self.do_sleep()
resp = sess.get('https://{0}/{1}'.format(host, path), params=params, allow_redirects=False)
while resp.is_redirect:
redirect_url = resp.headers['location']
@ -355,7 +354,7 @@ class InstaloaderContext:
self.log('The request will be retried in {} seconds, at {:%H:%M}.'
.format(waittime, datetime.now() + timedelta(seconds=waittime)))
time.sleep(waittime)
self._sleep()
self.do_sleep()
return self.get_json(path=path, params=params, host=host, session=sess, _attempt=_attempt + 1)
except KeyboardInterrupt:
self.error("[skipped by user]", repeat_at_end=False)
@ -463,34 +462,22 @@ class InstaloaderContext:
:raises QueryReturnedNotFoundException: When the server responds with a 404.
:raises QueryReturnedForbiddenException: When the server responds with a 403.
:raises ConnectionException: When download repeatedly failed.
:raises ConnectionException: When download failed.
.. versionadded:: 4.2.1"""
try:
with self.get_anonymous_session() as anonymous_session:
resp = anonymous_session.get(url, stream=True)
if resp.status_code == 200:
resp.raw.decode_content = True
return resp
else:
if resp.status_code == 403:
# suspected invalid URL signature
raise QueryReturnedForbiddenException("403 when accessing {}.".format(url))
if resp.status_code == 404:
# 404 not worth retrying.
raise QueryReturnedNotFoundException("404 when accessing {}.".format(url))
raise ConnectionException("HTTP error code {}.".format(resp.status_code))
except (urllib3.exceptions.HTTPError, requests.exceptions.RequestException, ConnectionException) as err:
error_string = "URL {}: {}".format(url, err)
if _attempt == self.max_connection_attempts:
raise ConnectionException(error_string) from err
self.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False)
try:
self._sleep()
return self.get_raw(url, _attempt + 1)
except KeyboardInterrupt:
self.error("[skipped by user]", repeat_at_end=False)
raise ConnectionException(error_string) from err
with self.get_anonymous_session() as anonymous_session:
resp = anonymous_session.get(url, stream=True)
if resp.status_code == 200:
resp.raw.decode_content = True
return resp
else:
if resp.status_code == 403:
# suspected invalid URL signature
raise QueryReturnedForbiddenException("403 when accessing {}.".format(url))
if resp.status_code == 404:
# 404 not worth retrying.
raise QueryReturnedNotFoundException("404 when accessing {}.".format(url))
raise ConnectionException("HTTP error code {}.".format(resp.status_code))
def get_and_write_raw(self, url: str, filename: str) -> None:
"""Downloads and writes anonymously-requested raw data into a file.