Fix retry logic for download of files

This fixes #235. The retry logic has been broken with commit
4bc0a94e12834b35255874da034cfe37a552be58 in version 4.2.1.
This commit is contained in:
Alexander Graf 2019-02-10 20:16:10 +01:00
parent a0b7804fd2
commit a9e79e90f9
2 changed files with 57 additions and 33 deletions

View File

@ -15,6 +15,9 @@ from hashlib import md5
from io import BytesIO from io import BytesIO
from typing import Any, Callable, Iterator, List, Optional, Set, Union from typing import Any, Callable, Iterator, List, Optional, Set, Union
import requests
import urllib3
from .exceptions import * from .exceptions import *
from .instaloadercontext import InstaloaderContext from .instaloadercontext import InstaloaderContext
from .structures import Highlight, JsonExportable, Post, PostLocation, Profile, Story, StoryItem, save_structure_to_file, load_structure_from_file from .structures import Highlight, JsonExportable, Post, PostLocation, Profile, Story, StoryItem, save_structure_to_file, load_structure_from_file
@ -47,6 +50,37 @@ def _requires_login(func: Callable) -> Callable:
return call return call
def _retry_on_connection_error(func: Callable) -> Callable:
"""Decorator to retry the function max_connection_attemps number of times.
Herewith-decorated functions need an ``_attempt`` keyword argument.
This is to decorate functions that do network requests that may fail. Note that
:meth:`.get_json`, :meth:`.get_iphone_json`, :meth:`.graphql_query` and :meth:`.graphql_node_list` already have
their own logic for retrying, hence functions that only use these for network access must not be decorated with this
decorator."""
@wraps(func)
def call(instaloader, *args, **kwargs):
try:
return func(instaloader, *args, **kwargs)
except (urllib3.exceptions.HTTPError, requests.exceptions.RequestException, ConnectionException) as err:
error_string = "{}({}): {}".format(func.__name__, ', '.join([repr(arg) for arg in args]), err)
if (kwargs.get('_attempt') or 1) == instaloader.context.max_connection_attempts:
raise ConnectionException(error_string) from None
instaloader.context.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False)
try:
if kwargs.get('_attempt'):
kwargs['_attempt'] += 1
else:
kwargs['_attempt'] = 2
instaloader.context.do_sleep()
return call(instaloader, *args, **kwargs)
except KeyboardInterrupt:
instaloader.context.error("[skipped by user]", repeat_at_end=False)
raise ConnectionException(error_string) from None
return call
class _ArbitraryItemFormatter(string.Formatter): class _ArbitraryItemFormatter(string.Formatter):
def __init__(self, item: Any): def __init__(self, item: Any):
self._item = item self._item = item
@ -175,7 +209,9 @@ class Instaloader:
def __exit__(self, *args): def __exit__(self, *args):
self.close() self.close()
def download_pic(self, filename: str, url: str, mtime: datetime, filename_suffix: Optional[str] = None) -> bool: @_retry_on_connection_error
def download_pic(self, filename: str, url: str, mtime: datetime,
filename_suffix: Optional[str] = None, _attempt: int = 1) -> bool:
"""Downloads and saves picture with given url under given directory with given timestamp. """Downloads and saves picture with given url under given directory with given timestamp.
Returns true, if file was actually downloaded, i.e. updated.""" Returns true, if file was actually downloaded, i.e. updated."""
urlmatch = re.search('\\.[a-z0-9]*\\?', url) urlmatch = re.search('\\.[a-z0-9]*\\?', url)
@ -285,7 +321,8 @@ class Instaloader:
os.utime(filename, (datetime.now().timestamp(), mtime.timestamp())) os.utime(filename, (datetime.now().timestamp(), mtime.timestamp()))
self.context.log('geo', end=' ', flush=True) self.context.log('geo', end=' ', flush=True)
def download_profilepic(self, profile: Profile) -> None: @_retry_on_connection_error
def download_profilepic(self, profile: Profile, _attempt: int = 1) -> None:
"""Downloads and saves profile pic.""" """Downloads and saves profile pic."""
def _epoch_to_string(epoch: datetime) -> str: def _epoch_to_string(epoch: datetime) -> str:

View File

@ -14,7 +14,6 @@ from typing import Any, Callable, Dict, Iterator, Optional, Union
import requests import requests
import requests.utils import requests.utils
import urllib3
from .exceptions import * from .exceptions import *
@ -195,7 +194,7 @@ class InstaloaderContext:
csrf_token = session.cookies.get_dict()['csrftoken'] csrf_token = session.cookies.get_dict()['csrftoken']
session.headers.update({'X-CSRFToken': csrf_token}) session.headers.update({'X-CSRFToken': csrf_token})
# Not using self.get_json() here, because we need to access csrftoken cookie # Not using self.get_json() here, because we need to access csrftoken cookie
self._sleep() self.do_sleep()
login = session.post('https://www.instagram.com/accounts/login/ajax/', login = session.post('https://www.instagram.com/accounts/login/ajax/',
data={'password': passwd, 'username': user}, allow_redirects=True) data={'password': passwd, 'username': user}, allow_redirects=True)
try: try:
@ -261,7 +260,7 @@ class InstaloaderContext:
self.username = user self.username = user
self.two_factor_auth_pending = None self.two_factor_auth_pending = None
def _sleep(self): def do_sleep(self):
"""Sleep a short time if self.sleep is set. Called before each request to instagram.com.""" """Sleep a short time if self.sleep is set. Called before each request to instagram.com."""
if self.sleep: if self.sleep:
time.sleep(min(random.expovariate(0.7), 5.0)) time.sleep(min(random.expovariate(0.7), 5.0))
@ -304,7 +303,7 @@ class InstaloaderContext:
self.query_timestamps = [time.monotonic()] self.query_timestamps = [time.monotonic()]
sess = session if session else self._session sess = session if session else self._session
try: try:
self._sleep() self.do_sleep()
resp = sess.get('https://{0}/{1}'.format(host, path), params=params, allow_redirects=False) resp = sess.get('https://{0}/{1}'.format(host, path), params=params, allow_redirects=False)
while resp.is_redirect: while resp.is_redirect:
redirect_url = resp.headers['location'] redirect_url = resp.headers['location']
@ -355,7 +354,7 @@ class InstaloaderContext:
self.log('The request will be retried in {} seconds, at {:%H:%M}.' self.log('The request will be retried in {} seconds, at {:%H:%M}.'
.format(waittime, datetime.now() + timedelta(seconds=waittime))) .format(waittime, datetime.now() + timedelta(seconds=waittime)))
time.sleep(waittime) time.sleep(waittime)
self._sleep() self.do_sleep()
return self.get_json(path=path, params=params, host=host, session=sess, _attempt=_attempt + 1) return self.get_json(path=path, params=params, host=host, session=sess, _attempt=_attempt + 1)
except KeyboardInterrupt: except KeyboardInterrupt:
self.error("[skipped by user]", repeat_at_end=False) self.error("[skipped by user]", repeat_at_end=False)
@ -463,34 +462,22 @@ class InstaloaderContext:
:raises QueryReturnedNotFoundException: When the server responds with a 404. :raises QueryReturnedNotFoundException: When the server responds with a 404.
:raises QueryReturnedForbiddenException: When the server responds with a 403. :raises QueryReturnedForbiddenException: When the server responds with a 403.
:raises ConnectionException: When download repeatedly failed. :raises ConnectionException: When download failed.
.. versionadded:: 4.2.1""" .. versionadded:: 4.2.1"""
try: with self.get_anonymous_session() as anonymous_session:
with self.get_anonymous_session() as anonymous_session: resp = anonymous_session.get(url, stream=True)
resp = anonymous_session.get(url, stream=True) if resp.status_code == 200:
if resp.status_code == 200: resp.raw.decode_content = True
resp.raw.decode_content = True return resp
return resp else:
else: if resp.status_code == 403:
if resp.status_code == 403: # suspected invalid URL signature
# suspected invalid URL signature raise QueryReturnedForbiddenException("403 when accessing {}.".format(url))
raise QueryReturnedForbiddenException("403 when accessing {}.".format(url)) if resp.status_code == 404:
if resp.status_code == 404: # 404 not worth retrying.
# 404 not worth retrying. raise QueryReturnedNotFoundException("404 when accessing {}.".format(url))
raise QueryReturnedNotFoundException("404 when accessing {}.".format(url)) raise ConnectionException("HTTP error code {}.".format(resp.status_code))
raise ConnectionException("HTTP error code {}.".format(resp.status_code))
except (urllib3.exceptions.HTTPError, requests.exceptions.RequestException, ConnectionException) as err:
error_string = "URL {}: {}".format(url, err)
if _attempt == self.max_connection_attempts:
raise ConnectionException(error_string) from err
self.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False)
try:
self._sleep()
return self.get_raw(url, _attempt + 1)
except KeyboardInterrupt:
self.error("[skipped by user]", repeat_at_end=False)
raise ConnectionException(error_string) from err
def get_and_write_raw(self, url: str, filename: str) -> None: def get_and_write_raw(self, url: str, filename: str) -> None:
"""Downloads and writes anonymously-requested raw data into a file. """Downloads and writes anonymously-requested raw data into a file.