From a9e79e90f9726e048068d754a8678ec3d6506d97 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 10 Feb 2019 20:16:10 +0100 Subject: [PATCH] Fix retry logic for download of files This fixes #235. The retry logic has been broken with commit 4bc0a94e12834b35255874da034cfe37a552be58 in version 4.2.1. --- instaloader/instaloader.py | 41 ++++++++++++++++++++++++-- instaloader/instaloadercontext.py | 49 ++++++++++++------------------- 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/instaloader/instaloader.py b/instaloader/instaloader.py index 15248f0..44ff66e 100644 --- a/instaloader/instaloader.py +++ b/instaloader/instaloader.py @@ -15,6 +15,9 @@ from hashlib import md5 from io import BytesIO from typing import Any, Callable, Iterator, List, Optional, Set, Union +import requests +import urllib3 + from .exceptions import * from .instaloadercontext import InstaloaderContext from .structures import Highlight, JsonExportable, Post, PostLocation, Profile, Story, StoryItem, save_structure_to_file, load_structure_from_file @@ -47,6 +50,37 @@ def _requires_login(func: Callable) -> Callable: return call +def _retry_on_connection_error(func: Callable) -> Callable: + """Decorator to retry the function max_connection_attemps number of times. + + Herewith-decorated functions need an ``_attempt`` keyword argument. + + This is to decorate functions that do network requests that may fail. Note that + :meth:`.get_json`, :meth:`.get_iphone_json`, :meth:`.graphql_query` and :meth:`.graphql_node_list` already have + their own logic for retrying, hence functions that only use these for network access must not be decorated with this + decorator.""" + @wraps(func) + def call(instaloader, *args, **kwargs): + try: + return func(instaloader, *args, **kwargs) + except (urllib3.exceptions.HTTPError, requests.exceptions.RequestException, ConnectionException) as err: + error_string = "{}({}): {}".format(func.__name__, ', '.join([repr(arg) for arg in args]), err) + if (kwargs.get('_attempt') or 1) == instaloader.context.max_connection_attempts: + raise ConnectionException(error_string) from None + instaloader.context.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False) + try: + if kwargs.get('_attempt'): + kwargs['_attempt'] += 1 + else: + kwargs['_attempt'] = 2 + instaloader.context.do_sleep() + return call(instaloader, *args, **kwargs) + except KeyboardInterrupt: + instaloader.context.error("[skipped by user]", repeat_at_end=False) + raise ConnectionException(error_string) from None + return call + + class _ArbitraryItemFormatter(string.Formatter): def __init__(self, item: Any): self._item = item @@ -175,7 +209,9 @@ class Instaloader: def __exit__(self, *args): self.close() - def download_pic(self, filename: str, url: str, mtime: datetime, filename_suffix: Optional[str] = None) -> bool: + @_retry_on_connection_error + def download_pic(self, filename: str, url: str, mtime: datetime, + filename_suffix: Optional[str] = None, _attempt: int = 1) -> bool: """Downloads and saves picture with given url under given directory with given timestamp. Returns true, if file was actually downloaded, i.e. updated.""" urlmatch = re.search('\\.[a-z0-9]*\\?', url) @@ -285,7 +321,8 @@ class Instaloader: os.utime(filename, (datetime.now().timestamp(), mtime.timestamp())) self.context.log('geo', end=' ', flush=True) - def download_profilepic(self, profile: Profile) -> None: + @_retry_on_connection_error + def download_profilepic(self, profile: Profile, _attempt: int = 1) -> None: """Downloads and saves profile pic.""" def _epoch_to_string(epoch: datetime) -> str: diff --git a/instaloader/instaloadercontext.py b/instaloader/instaloadercontext.py index d0993cd..d32e151 100644 --- a/instaloader/instaloadercontext.py +++ b/instaloader/instaloadercontext.py @@ -14,7 +14,6 @@ from typing import Any, Callable, Dict, Iterator, Optional, Union import requests import requests.utils -import urllib3 from .exceptions import * @@ -195,7 +194,7 @@ class InstaloaderContext: csrf_token = session.cookies.get_dict()['csrftoken'] session.headers.update({'X-CSRFToken': csrf_token}) # Not using self.get_json() here, because we need to access csrftoken cookie - self._sleep() + self.do_sleep() login = session.post('https://www.instagram.com/accounts/login/ajax/', data={'password': passwd, 'username': user}, allow_redirects=True) try: @@ -261,7 +260,7 @@ class InstaloaderContext: self.username = user self.two_factor_auth_pending = None - def _sleep(self): + def do_sleep(self): """Sleep a short time if self.sleep is set. Called before each request to instagram.com.""" if self.sleep: time.sleep(min(random.expovariate(0.7), 5.0)) @@ -304,7 +303,7 @@ class InstaloaderContext: self.query_timestamps = [time.monotonic()] sess = session if session else self._session try: - self._sleep() + self.do_sleep() resp = sess.get('https://{0}/{1}'.format(host, path), params=params, allow_redirects=False) while resp.is_redirect: redirect_url = resp.headers['location'] @@ -355,7 +354,7 @@ class InstaloaderContext: self.log('The request will be retried in {} seconds, at {:%H:%M}.' .format(waittime, datetime.now() + timedelta(seconds=waittime))) time.sleep(waittime) - self._sleep() + self.do_sleep() return self.get_json(path=path, params=params, host=host, session=sess, _attempt=_attempt + 1) except KeyboardInterrupt: self.error("[skipped by user]", repeat_at_end=False) @@ -463,34 +462,22 @@ class InstaloaderContext: :raises QueryReturnedNotFoundException: When the server responds with a 404. :raises QueryReturnedForbiddenException: When the server responds with a 403. - :raises ConnectionException: When download repeatedly failed. + :raises ConnectionException: When download failed. .. versionadded:: 4.2.1""" - try: - with self.get_anonymous_session() as anonymous_session: - resp = anonymous_session.get(url, stream=True) - if resp.status_code == 200: - resp.raw.decode_content = True - return resp - else: - if resp.status_code == 403: - # suspected invalid URL signature - raise QueryReturnedForbiddenException("403 when accessing {}.".format(url)) - if resp.status_code == 404: - # 404 not worth retrying. - raise QueryReturnedNotFoundException("404 when accessing {}.".format(url)) - raise ConnectionException("HTTP error code {}.".format(resp.status_code)) - except (urllib3.exceptions.HTTPError, requests.exceptions.RequestException, ConnectionException) as err: - error_string = "URL {}: {}".format(url, err) - if _attempt == self.max_connection_attempts: - raise ConnectionException(error_string) from err - self.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False) - try: - self._sleep() - return self.get_raw(url, _attempt + 1) - except KeyboardInterrupt: - self.error("[skipped by user]", repeat_at_end=False) - raise ConnectionException(error_string) from err + with self.get_anonymous_session() as anonymous_session: + resp = anonymous_session.get(url, stream=True) + if resp.status_code == 200: + resp.raw.decode_content = True + return resp + else: + if resp.status_code == 403: + # suspected invalid URL signature + raise QueryReturnedForbiddenException("403 when accessing {}.".format(url)) + if resp.status_code == 404: + # 404 not worth retrying. + raise QueryReturnedNotFoundException("404 when accessing {}.".format(url)) + raise ConnectionException("HTTP error code {}.".format(resp.status_code)) def get_and_write_raw(self, url: str, filename: str) -> None: """Downloads and writes anonymously-requested raw data into a file.