diff --git a/README.rst b/README.rst index ac53106..5e1ce43 100644 --- a/README.rst +++ b/README.rst @@ -49,7 +49,9 @@ directory accordingly, - allows **fine-grained customization** of filters and where to store - downloaded media. + downloaded media, + +- automatically **resumes previously-interrupted** download iterations. :: diff --git a/docs/as-module.rst b/docs/as-module.rst index 27db171..52e5b77 100644 --- a/docs/as-module.rst +++ b/docs/as-module.rst @@ -226,6 +226,19 @@ Exceptions .. autoexception:: TooManyRequestsException +Resumable Iterations +^^^^^^^^^^^^^^^^^^^^ + +.. versionadded:: 4.5 + +.. autoclass:: NodeIterator + :no-show-inheritance: + +.. autoclass:: FrozenNodeIterator + :no-show-inheritance: + +.. autofunction:: resumable_iteration + ``InstaloaderContext`` (Low-level functions) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -241,4 +254,4 @@ Exceptions .. autoclass:: RateController :no-show-inheritance: - .. versionadded:: 4.5 \ No newline at end of file + .. versionadded:: 4.5 diff --git a/docs/cli-options.rst b/docs/cli-options.rst index 8f3f3ec..ec3218b 100644 --- a/docs/cli-options.rst +++ b/docs/cli-options.rst @@ -223,6 +223,29 @@ How to Download ``#hashtag`` or the profile name. Defaults to ``{date_utc}_UTC``. See :ref:`filename-specification` for a list of supported tokens. +.. option:: --resume-prefix prefix + + For many targets, Instaloader is capable of resuming a previously-aborted + download loop. To do so, it creates a JSON file within the target directory + when interrupted. This option controls the prefix for filenames that are + used to save the information to resume an interrupted download. The default + prefix is ``iterator``. + + Resuming an interrupted download is supported for most, but not all targets. + JSON files with resume information are always compressed, regardless of + :option:`--no-compress-json`. + + This feature is turned off entirely with :option:`--no-resume`. + + .. versionadded:: 4.5 + +.. 
option:: --no-resume + + Do not resume a previously-aborted download iteration, and do not save such + information when interrupted. + + .. versionadded:: 4.5 + .. option:: --user-agent USER_AGENT User Agent to use for HTTP requests. Per default, Instaloader pretends being diff --git a/docs/conf.py b/docs/conf.py index 5a47bfa..e66d7a1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -50,6 +50,8 @@ autodoc_member_order = 'bysource' intersphinx_mapping = {'python': ('https://docs.python.org/3', None), 'requests': ('https://requests.kennethreitz.org/en/master/', None)} +nitpick_ignore = [('py:class', 'typing.Tuple')] + current_release = subprocess.check_output(["git", "describe", "--abbrev=0"]).decode("ascii")[1:-1] date_format = "%e %b %Y" if platform.system() != "Windows" else "%d %b %Y" current_release_date = subprocess.check_output( diff --git a/docs/index.rst b/docs/index.rst index c8aa419..8bdbce6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -40,6 +40,8 @@ See :ref:`install` for more options on how to install Instaloader. - allows **fine-grained customization** of filters and where to store downloaded media, +- automatically **resumes previously-interrupted** download iterations, + - is free `open source `__ software written in Python. 
diff --git a/instaloader/__init__.py b/instaloader/__init__.py index b03c906..cdfef3d 100644 --- a/instaloader/__init__.py +++ b/instaloader/__init__.py @@ -15,5 +15,6 @@ else: from .exceptions import * from .instaloader import Instaloader from .instaloadercontext import InstaloaderContext, RateController +from .nodeiterator import NodeIterator, FrozenNodeIterator, resumable_iteration from .structures import (Hashtag, Highlight, Post, PostSidecarNode, PostComment, PostCommentAnswer, PostLocation, Profile, Story, StoryItem, TopSearchResults, load_structure_from_file, save_structure_to_file) diff --git a/instaloader/__main__.py b/instaloader/__main__.py index f83e76d..1d5e984 100644 --- a/instaloader/__main__.py +++ b/instaloader/__main__.py @@ -351,6 +351,13 @@ def main(): '--dirname-pattern. {profile} is replaced by the profile name,' '{target} is replaced by the target you specified, i.e. either :feed' '#hashtag or the profile name. Defaults to \'{date_utc}_UTC\'') + g_how.add_argument('--resume-prefix', metavar='PREFIX', + help='Prefix for filenames that are used to save the information to resume an interrupted ' + 'download.') + g_how.add_argument('--no-resume', action='store_true', + help='Do not resume a previously-aborted download iteration, and do not save such information ' + 'when interrupted.') + g_how.add_argument('--use-aged-resume-files', action='store_true', help=SUPPRESS) g_how.add_argument('--user-agent', help='User Agent to use for HTTP requests. 
Defaults to \'{}\'.'.format(default_user_agent())) g_how.add_argument('-S', '--no-sleep', action='store_true', help=SUPPRESS) @@ -394,6 +401,10 @@ def main(): raise SystemExit("--no-captions and --post-metadata-txt or --storyitem-metadata-txt given; " "That contradicts.") + if args.no_resume and args.resume_prefix: + raise SystemExit("--no-resume and --resume-prefix given; That contradicts.") + resume_prefix = (args.resume_prefix if args.resume_prefix else 'iterator') if not args.no_resume else None + if args.no_pictures and args.fast_update: raise SystemExit('--no-pictures and --fast-update cannot be used together.') @@ -412,7 +423,9 @@ def main(): post_metadata_txt_pattern=post_metadata_txt_pattern, storyitem_metadata_txt_pattern=storyitem_metadata_txt_pattern, max_connection_attempts=args.max_connection_attempts, - request_timeout=args.request_timeout) + request_timeout=args.request_timeout, + resume_prefix=resume_prefix, + check_resume_bbd=not args.use_aged_resume_files) _main(loader, args.profile, username=args.login.lower() if args.login is not None else None, diff --git a/instaloader/instaloader.py b/instaloader/instaloader.py index f0506cf..93a0da1 100644 --- a/instaloader/instaloader.py +++ b/instaloader/instaloader.py @@ -20,8 +20,9 @@ import urllib3 # type: ignore from .exceptions import * from .instaloadercontext import InstaloaderContext, RateController +from .nodeiterator import NodeIterator, resumable_iteration from .structures import (Hashtag, Highlight, JsonExportable, Post, PostLocation, Profile, Story, StoryItem, - save_structure_to_file) + load_structure_from_file, save_structure_to_file) def get_default_session_filename(username: str) -> str: @@ -154,6 +155,8 @@ class Instaloader: :param max_connection_attempts: :option:`--max-connection-attempts` :param request_timeout: :option:`--request-timeout`, set per-request timeout (seconds) :param rate_controller: Generator for a :class:`RateController` to override rate controlling behavior + :param 
resume_prefix: :option:`--resume-prefix`, or None for :option:`--no-resume`. + :param check_resume_bbd: Whether to check the date of expiry of resume files and reject them if expired. .. attribute:: context @@ -177,7 +180,9 @@ class Instaloader: storyitem_metadata_txt_pattern: str = None, max_connection_attempts: int = 3, request_timeout: Optional[float] = None, - rate_controller: Optional[Callable[[InstaloaderContext], RateController]] = None): + rate_controller: Optional[Callable[[InstaloaderContext], RateController]] = None, + resume_prefix: Optional[str] = "iterator", + check_resume_bbd: bool = True): self.context = InstaloaderContext(sleep, quiet, user_agent, max_connection_attempts, request_timeout, rate_controller) @@ -196,6 +201,8 @@ class Instaloader: else post_metadata_txt_pattern self.storyitem_metadata_txt_pattern = '' if storyitem_metadata_txt_pattern is None \ else storyitem_metadata_txt_pattern + self.resume_prefix = resume_prefix + self.check_resume_bbd = check_resume_bbd @contextmanager def anonymous_copy(self): @@ -216,7 +223,9 @@ class Instaloader: post_metadata_txt_pattern=self.post_metadata_txt_pattern, storyitem_metadata_txt_pattern=self.storyitem_metadata_txt_pattern, max_connection_attempts=self.context.max_connection_attempts, - request_timeout=self.context.request_timeout) + request_timeout=self.context.request_timeout, + resume_prefix=self.resume_prefix, + check_resume_bbd=self.check_resume_bbd) yield new_loader self.context.error_log.extend(new_loader.context.error_log) new_loader.context.error_log = [] # avoid double-printing of errors @@ -356,6 +365,24 @@ class Instaloader: os.utime(filename, (datetime.now().timestamp(), mtime.timestamp())) self.context.log('geo', end=' ', flush=True) + def format_filename_within_target_path(self, + target: Union[str, Path], + owner_profile: Optional[Profile], + identifier: str, + name_suffix: str, + extension: str): + """Returns a filename within the target path. + + .. 
versionadded:: 4.5""" + if ((format_string_contains_key(self.dirname_pattern, 'profile') or + format_string_contains_key(self.dirname_pattern, 'target'))): + profile_str = owner_profile.username.lower() if owner_profile is not None else target + return os.path.join(self.dirname_pattern.format(profile=profile_str, target=target), + '{0}_{1}.{2}'.format(identifier, name_suffix, extension)) + else: + return os.path.join(self.dirname_pattern.format(), + '{0}_{1}_{2}.{3}'.format(target, identifier, name_suffix, extension)) + @_retry_on_connection_error def download_title_pic(self, url: str, target: Union[str, Path], name_suffix: str, owner_profile: Optional[Profile], _attempt: int = 1) -> None: @@ -376,16 +403,7 @@ class Instaloader: else: pic_bytes = http_response.content pic_identifier = md5(pic_bytes).hexdigest()[:16] - pic_extension = 'jpg' - if ((format_string_contains_key(self.dirname_pattern, 'profile') or - format_string_contains_key(self.dirname_pattern, 'target'))): - profile_str = owner_profile.username.lower() if owner_profile is not None else target - filename = os.path.join(self.dirname_pattern.format(profile=profile_str, - target=target), - '{0}_{1}.{2}'.format(pic_identifier, name_suffix, pic_extension)) - else: - filename = os.path.join(self.dirname_pattern.format(), - '{0}_{1}_{2}.{3}'.format(target, pic_identifier, name_suffix, pic_extension)) + filename = self.format_filename_within_target_path(target, owner_profile, pic_identifier, name_suffix, 'jpg') content_length = http_response.headers.get('Content-Length', None) if os.path.isfile(filename) and (not self.context.is_logged_in or (content_length is not None and @@ -705,59 +723,75 @@ class Instaloader: fast_update: bool = False, post_filter: Optional[Callable[[Post], bool]] = None, max_count: Optional[int] = None, - total_count: Optional[int] = None) -> None: + total_count: Optional[int] = None, + owner_profile: Optional[Profile] = None) -> None: """ Download the Posts returned by given Post 
Iterator. ..versionadded:: 4.4 + ..versionchanged:: 4.5 + Transparently resume an aborted operation if `posts` is a :class:`NodeIterator`. + :param posts: Post Iterator to loop through. - :param target: Target name - :param fast_update: :option:`--fast-update` - :param post_filter: :option:`--post-filter` - :param max_count: Maximum count of Posts to download (:option:`--count`) - :param total_count: Total number of posts returned by given iterator + :param target: Target name. + :param fast_update: :option:`--fast-update`. + :param post_filter: :option:`--post-filter`. + :param max_count: Maximum count of Posts to download (:option:`--count`). + :param total_count: Total number of posts returned by given iterator. + :param owner_profile: Associated profile, if any. """ - for number, post in enumerate(posts): - if max_count is not None and number >= max_count: - break - if total_count is not None: - self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number + 1, total_count, - w=len(str(total_count))), - end="", flush=True) - else: - if max_count is not None: - self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number + 1, max_count, - w=len(str(max_count))), + displayed_count = (max_count if total_count is None or max_count is not None and max_count < total_count + else total_count) + with resumable_iteration( + context=self.context, + iterator=posts, + load=load_structure_from_file, + save=save_structure_to_file, + format_path=lambda magic: self.format_filename_within_target_path( + target, owner_profile, self.resume_prefix or '', magic, 'json.xz' + ), + check_bbd=self.check_resume_bbd, + enabled=self.resume_prefix is not None + ) as resume_info: + is_resuming, start_index = resume_info + for number, post in enumerate(posts): + if max_count is not None and number + start_index >= max_count: + break + if displayed_count is not None: + self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number + start_index + 1, displayed_count, + w=len(str(displayed_count))), end="", 
flush=True) else: - self.context.log("[{:3d}] ".format(number + 1), end="", flush=True) - if post_filter is not None: - try: - if not post_filter(post): - self.context.log("{} skipped".format(post)) - continue - except (InstaloaderException, KeyError, TypeError) as err: - self.context.error("{} skipped. Filter evaluation failed: {}".format(post, err)) - continue - with self.context.error_catcher("Download {} of {}".format(post, target)): - # The PostChangedException gets raised if the Post's id/shortcode changed while obtaining - # additional metadata. This is most likely the case if a HTTP redirect takes place while - # resolving the shortcode URL. - # The `post_changed` variable keeps the fast-update functionality alive: A Post which is - # obained after a redirect has probably already been downloaded as a previous Post of the - # same Profile. - # Observed in issue #225: https://github.com/instaloader/instaloader/issues/225 - post_changed = False - while True: + self.context.log("[{:3d}] ".format(number + start_index + 1), end="", flush=True) + if post_filter is not None: try: - downloaded = self.download_post(post, target=target) - break - except PostChangedException: - post_changed = True + if not post_filter(post): + self.context.log("{} skipped".format(post)) + continue + except (InstaloaderException, KeyError, TypeError) as err: + self.context.error("{} skipped. Filter evaluation failed: {}".format(post, err)) continue - if fast_update and not downloaded and not post_changed: - break + with self.context.error_catcher("Download {} of {}".format(post, target)): + # The PostChangedException gets raised if the Post's id/shortcode changed while obtaining + # additional metadata. This is most likely the case if a HTTP redirect takes place while + # resolving the shortcode URL. 
+ # The `post_changed` variable keeps the fast-update functionality alive: A Post which is + # obained after a redirect has probably already been downloaded as a previous Post of the + # same Profile. + # Observed in issue #225: https://github.com/instaloader/instaloader/issues/225 + post_changed = False + while True: + try: + downloaded = self.download_post(post, target=target) + break + except PostChangedException: + post_changed = True + continue + if fast_update and not downloaded and not post_changed: + # disengage fast_update for first post when resuming + if not is_resuming or number > 0: + break @_requires_login def get_feed_posts(self) -> Iterator[Post]: @@ -817,8 +851,10 @@ class Instaloader: """ self.context.log("Retrieving saved posts...") assert self.context.username is not None # safe due to @_requires_login; required by typechecker - self.posts_download_loop(Profile.from_username(self.context, self.context.username).get_saved_posts(), ":saved", - fast_update, post_filter, max_count=max_count) + node_iterator = Profile.from_username(self.context, self.context.username).get_saved_posts() + self.posts_download_loop(node_iterator, ":saved", + fast_update, post_filter, + max_count=max_count, total_count=node_iterator.count) @_requires_login def get_location_posts(self, location: str) -> Iterator[Post]: @@ -873,18 +909,20 @@ class Instaloader: max_count=max_count) @_requires_login - def get_explore_posts(self) -> Iterator[Post]: + def get_explore_posts(self) -> NodeIterator[Post]: """Get Posts which are worthy of exploring suggested by Instagram. :return: Iterator over Posts of the user's suggested posts. + :rtype: NodeIterator[Post] :raises LoginRequiredException: If called without being logged in. 
""" - data = self.context.get_json('explore/', {}) - yield from (Post(self.context, node) - for node in self.context.graphql_node_list("df0dcc250c2b18d9fd27c5581ef33c7c", - {}, 'https://www.instagram.com/explore/', - lambda d: d['data']['user']['edge_web_discover_media'], - data.get('rhx_gis'))) + return NodeIterator( + self.context, + 'df0dcc250c2b18d9fd27c5581ef33c7c', + lambda d: d['data']['user']['edge_web_discover_media'], + lambda n: Post(self.context, n), + query_referer='https://www.instagram.com/explore/', + ) def get_hashtag_posts(self, hashtag: str) -> Iterator[Post]: """Get Posts associated with a #hashtag. @@ -955,7 +993,7 @@ class Instaloader: .. versionadded:: 4.3""" self.context.log("Retrieving IGTV videos for profile {}.".format(profile.username)) self.posts_download_loop(profile.get_igtv_posts(), profile.username, fast_update, post_filter, - total_count=profile.igtvcount) + total_count=profile.igtvcount, owner_profile=profile) def _get_id_filename(self, profile_name: str) -> str: if ((format_string_contains_key(self.dirname_pattern, 'profile') or @@ -1110,7 +1148,7 @@ class Instaloader: if posts: self.context.log("Retrieving posts from profile {}.".format(profile_name)) self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter, - total_count=profile.mediacount) + total_count=profile.mediacount, owner_profile=profile) if stories and profiles: with self.context.error_catcher("Download stories"): @@ -1190,7 +1228,7 @@ class Instaloader: # Iterate over pictures and download them self.context.log("Retrieving posts from profile {}.".format(profile_name)) self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter, - total_count=profile.mediacount) + total_count=profile.mediacount, owner_profile=profile) def interactive_login(self, username: str) -> None: """Logs in and internally stores session, asking user for password interactively. 
diff --git a/instaloader/instaloadercontext.py b/instaloader/instaloadercontext.py index fe53466..1a66177 100644 --- a/instaloader/instaloadercontext.py +++ b/instaloader/instaloadercontext.py @@ -428,7 +428,12 @@ class InstaloaderContext: edge_extractor: Callable[[Dict[str, Any]], Dict[str, Any]], rhx_gis: Optional[str] = None, first_data: Optional[Dict[str, Any]] = None) -> Iterator[Dict[str, Any]]: - """Retrieve a list of GraphQL nodes.""" + """ + Retrieve a list of GraphQL nodes. + + ..deprecated:: 4.5 + Use :class:`NodeIterator` instead, which provides more functionality. + """ def _query(): query_variables['first'] = self._graphql_page_length diff --git a/instaloader/nodeiterator.py b/instaloader/nodeiterator.py new file mode 100644 index 0000000..7b7a872 --- /dev/null +++ b/instaloader/nodeiterator.py @@ -0,0 +1,261 @@ +import base64 +import hashlib +import json +import os +from contextlib import contextmanager +from datetime import datetime, timedelta +from lzma import LZMAError +from typing import Any, Callable, Dict, Iterator, NamedTuple, Optional, Tuple, TypeVar + +from .exceptions import InvalidArgumentException, QueryReturnedBadRequestException +from .instaloadercontext import InstaloaderContext + +FrozenNodeIterator = NamedTuple('FrozenNodeIterator', + [('query_hash', str), + ('query_variables', Dict), + ('query_referer', Optional[str]), + ('context_username', Optional[str]), + ('total_index', int), + ('best_before', Optional[float]), + ('remaining_data', Optional[Dict])]) +FrozenNodeIterator.__doc__ = \ + """A serializable representation of a :class:`NodeIterator` instance, saving its iteration state.""" +FrozenNodeIterator.query_hash.__doc__ = """The GraphQL ``query_hash`` parameter.""" +FrozenNodeIterator.query_variables.__doc__ = """The GraphQL ``query_variables`` parameter.""" +FrozenNodeIterator.query_referer.__doc__ = """The HTTP referer used for the GraphQL query.""" +FrozenNodeIterator.context_username.__doc__ = """The username who created 
the iterator, or ``None``.""" +FrozenNodeIterator.total_index.__doc__ = """Number of items that have already been returned.""" +FrozenNodeIterator.best_before.__doc__ = """Date when parts of the stored nodes might have expired.""" +FrozenNodeIterator.remaining_data.__doc__ = \ + """The already-retrieved, yet-unprocessed ``edges`` and the ``page_info`` at time of freezing.""" + + +T = TypeVar('T') + + +class NodeIterator(Iterator[T]): + """ + Iterate the nodes within edges in a GraphQL pagination. Instances of this class are returned by many (but not all) + of Instaloader's :class:`Post`-returning functions (such as :meth:`Profile.get_posts` etc.). + + What makes this iterator special is its ability to freeze/store its current state, e.g. to interrupt an iteration, + and later thaw/resume from where it left off. + + You can freeze a NodeIterator with :meth:`NodeIterator.freeze`:: + + post_iterator = profile.get_posts() + try: + for post in post_iterator: + do_something_with(post) + except KeyboardInterrupt: + save("resume_information.json", post_iterator.freeze()) + + and later reuse it with :meth:`NodeIterator.thaw` on an equally-constructed NodeIterator:: + + post_iterator = profile.get_posts() + post_iterator.thaw(load("resume_information.json")) + + A :class:`FrozenNodeIterator` can only be thawn with a matching NodeIterator, i.e. a NodeIterator instance that has + been constructed with the same parameters as the instance that is represented by the :class:`FrozenNodeIterator` in + question. This is to ensure that an iteration cannot be resumed in a wrong, unmatching loop. As a quick way to + distinguish iterators that are saved e.g. in files, there is the :attr:`NodeIterator.magic` string: Two + NodeIterators are matching if they have the same magic. + + See also :func:`resumable_iteration` for a high-level context manager that handles a resumable iteration. 
+ """ + + _graphql_page_length = 50 + shelf_life = timedelta(days=29) + + def __init__(self, + context: InstaloaderContext, + query_hash: str, + edge_extractor: Callable[[Dict[str, Any]], Dict[str, Any]], + node_wrapper: Callable[[Dict], T], + query_variables: Optional[Dict[str, Any]] = None, + query_referer: Optional[str] = None, + first_data: Optional[Dict[str, Any]] = None): + self._context = context + self._query_hash = query_hash + self._edge_extractor = edge_extractor + self._node_wrapper = node_wrapper + self._query_variables = query_variables if query_variables is not None else {} + self._query_referer = query_referer + self._data = first_data + self._page_index = 0 + self._total_index = 0 + self._best_before = (None if first_data is None else + datetime.now() + NodeIterator.shelf_life) + + def _query(self, after: Optional[str] = None) -> Dict: + pagination_variables = {'first': NodeIterator._graphql_page_length} # type: Dict[str, Any] + if after is not None: + pagination_variables['after'] = after + try: + data = self._edge_extractor( + self._context.graphql_query( + self._query_hash, {**self._query_variables, **pagination_variables}, self._query_referer + ) + ) + self._best_before = datetime.now() + NodeIterator.shelf_life + return data + except QueryReturnedBadRequestException: + new_page_length = int(NodeIterator._graphql_page_length / 2) + if new_page_length >= 12: + NodeIterator._graphql_page_length = new_page_length + self._context.error("HTTP Error 400 (Bad Request) on GraphQL Query. 
Retrying with shorter page length.", + repeat_at_end=False) + return self._query(after) + else: + raise + + def __iter__(self): + return self + + def __next__(self): + if self._data is None: + self._data = self._query() + if self._page_index < len(self._data['edges']): + node = self._data['edges'][self._page_index]['node'] + page_index, total_index = self._page_index, self._total_index + try: + self._page_index += 1 + self._total_index += 1 + except KeyboardInterrupt: + self._page_index, self._total_index = page_index, total_index + raise + return self._node_wrapper(node) + if self._data['page_info']['has_next_page']: + query_response = self._query(self._data['page_info']['end_cursor']) + page_index, data = self._page_index, self._data + try: + self._page_index = 0 + self._data = query_response + except KeyboardInterrupt: + self._page_index, self._data = page_index, data + raise + return self.__next__() + raise StopIteration() + + @property + def count(self) -> Optional[int]: + """The ``count`` as returned by Instagram. This is not always the total count this iterator will yield.""" + return self._data.get('count') if self._data is not None else None + + @property + def total_index(self) -> int: + """Number of items that have already been returned.""" + return self._total_index + + @property + def magic(self) -> str: + """Magic string for easily identifying a matching iterator file for resuming (hash of some parameters).""" + if 'blake2b' not in hashlib.algorithms_available: + magic_hash = hashlib.new('sha224') + else: + # Use blake2b when possible, i.e. on Python >= 3.6. 
+ magic_hash = hashlib.blake2b(digest_size=6) # type:ignore # pylint: disable=no-member + magic_hash.update(json.dumps( + [self._query_hash, self._query_variables, self._query_referer, self._context.username] + ).encode()) + return base64.urlsafe_b64encode(magic_hash.digest()).decode() + + def freeze(self) -> FrozenNodeIterator: + """Freeze the iterator for later resuming.""" + remaining_data = None + if self._data is not None: + remaining_data = {**self._data, + 'edges': (self._data['edges'][(max(self._page_index - 1, 0)):])} + return FrozenNodeIterator( + query_hash=self._query_hash, + query_variables=self._query_variables, + query_referer=self._query_referer, + context_username=self._context.username, + total_index=max(self.total_index - 1, 0), + best_before=self._best_before.timestamp() if self._best_before else None, + remaining_data=remaining_data, + ) + + def thaw(self, frozen: FrozenNodeIterator) -> None: + """Use this iterator for resuming from earlier iteration.""" + if self._total_index or self._page_index: + raise InvalidArgumentException("thaw() called on already-used iterator.") + if (self._query_hash != frozen.query_hash or + self._query_variables != frozen.query_variables or + self._query_referer != frozen.query_referer or + self._context.username != frozen.context_username): + raise InvalidArgumentException("Mismatching resume information.") + self._total_index = frozen.total_index + self._best_before = datetime.fromtimestamp(frozen.best_before) if frozen.best_before else None + self._data = frozen.remaining_data + + +@contextmanager +def resumable_iteration(context: InstaloaderContext, + iterator: Iterator, + load: Callable[[InstaloaderContext, str], Any], + save: Callable[[FrozenNodeIterator, str], None], + format_path: Callable[[str], str], + check_bbd: bool = True, + enabled: bool = True) -> Iterator[Tuple[bool, int]]: + """ + High-level context manager to handle a resumable iteration that can be interrupted with a KeyboardInterrupt. 
+ + It can be used as follows to automatically load a previously-saved state into the iterator, save the iterator's + state when interrupted, and delete the resume file upon completion:: + + post_iterator = profile.get_posts() + with resumable_iteration( + context=L.context, + iterator=post_iterator, + load=lambda _, path: FrozenNodeIterator(**json.load(open(path))), + save=lambda fni, path: json.dump(fni._asdict(), open(path, 'w')), + format_path=lambda magic: "resume_info_{}.json".format(magic) + ) as resume_info: + is_resuming, start_index = resume_info + for post in post_iterator: + do_something_with(post) + + It yields a tuple (is_resuming, start_index). + + When the passed iterator is not a :class:`NodeIterator`, it behaves as if ``resumable_iteration`` was not used, + just executing the inner body. + + :param context: The :class:`InstaloaderContext`. + :param iterator: The fresh :class:`NodeIterator`. + :param load: Loads a FrozenNodeIterator from given path. The object is ignored if it has a different type. + :param save: Saves the given FrozenNodeIterator to the given path. + :param format_path: Returns the path to the resume file for the given magic. + :param check_bbd: Whether to check the best before date and reject an expired FrozenNodeIterator. + :param enabled: Set to False to disable all functionality and simply execute the inner body. 
+ """ + if not enabled or not isinstance(iterator, NodeIterator): + yield False, 0 + return + is_resuming = False + start_index = 0 + resume_file_path = format_path(iterator.magic) + resume_file_exists = os.path.isfile(resume_file_path) + if resume_file_exists: + try: + fni = load(context, resume_file_path) + if not isinstance(fni, FrozenNodeIterator): + raise InvalidArgumentException("Invalid type.") + if check_bbd and fni.best_before and datetime.fromtimestamp(fni.best_before) < datetime.now(): + raise InvalidArgumentException("\"Best before\" date exceeded.") + iterator.thaw(fni) + is_resuming = True + start_index = iterator.total_index + context.log("Resuming from {}.".format(resume_file_path)) + except (InvalidArgumentException, LZMAError, json.decoder.JSONDecodeError) as exc: + context.error("Warning: Not resuming from {}: {}".format(resume_file_path, exc)) + try: + yield is_resuming, start_index + except KeyboardInterrupt: + os.makedirs(os.path.dirname(resume_file_path), exist_ok=True) + save(iterator.freeze(), resume_file_path) + context.log("\nSaved resume information to {}.".format(resume_file_path)) + raise + if resume_file_exists: + os.unlink(resume_file_path) + context.log("Iteration complete, deleted resume information file {}.".format(resume_file_path)) diff --git a/instaloader/structures.py b/instaloader/structures.py index c2dda25..a0b39f3 100644 --- a/instaloader/structures.py +++ b/instaloader/structures.py @@ -9,7 +9,7 @@ from typing import Any, Dict, Iterator, List, Optional, Union from . import __version__ from .exceptions import * from .instaloadercontext import InstaloaderContext - +from .nodeiterator import FrozenNodeIterator, NodeIterator PostSidecarNode = namedtuple('PostSidecarNode', ['is_video', 'display_url', 'video_url']) PostSidecarNode.__doc__ = "Item of a Sidecar Post." 
@@ -402,11 +402,14 @@ class Post: # If the answer's metadata already contains all comments, don't do GraphQL requests to obtain them yield from (_postcommentanswer(comment['node']) for comment in answer_edges) return - yield from (_postcommentanswer(answer_node) for answer_node in - self._context.graphql_node_list("51fdd02b67508306ad4484ff574a0b62", - {'comment_id': node['id']}, - 'https://www.instagram.com/p/' + self.shortcode + '/', - lambda d: d['data']['comment']['edge_threaded_comments'])) + yield from NodeIterator( + self._context, + '51fdd02b67508306ad4484ff574a0b62', + lambda d: d['data']['comment']['edge_threaded_comments'], + _postcommentanswer, + {'comment_id': node['id']}, + 'https://www.instagram.com/p/{0}/'.format(self.shortcode), + ) def _postcomment(node): return PostComment(*_postcommentanswer(node), @@ -422,12 +425,14 @@ class Post: # If the Post's metadata already contains all parent comments, don't do GraphQL requests to obtain them yield from (_postcomment(comment['node']) for comment in comment_edges) return - yield from (_postcomment(node) for node in - self._context.graphql_node_list( - "97b41c52301f77ce508f55e66d17620e", - {'shortcode': self.shortcode}, - 'https://www.instagram.com/p/' + self.shortcode + '/', - lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment'])) + yield from NodeIterator( + self._context, + '97b41c52301f77ce508f55e66d17620e', + lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment'], + _postcomment, + {'shortcode': self.shortcode}, + 'https://www.instagram.com/p/{0}/'.format(self.shortcode), + ) def get_likes(self) -> Iterator['Profile']: """Iterate over all likes of the post. 
A :class:`Profile` instance of each likee is yielded.""" @@ -439,10 +444,14 @@ class Post: # If the Post's metadata already contains all likes, don't do GraphQL requests to obtain them yield from (Profile(self._context, like['node']) for like in likes_edges) return - yield from (Profile(self._context, node) for node in - self._context.graphql_node_list("1cb6ec562846122743b61e492c85999f", {'shortcode': self.shortcode}, - 'https://www.instagram.com/p/' + self.shortcode + '/', - lambda d: d['data']['shortcode_media']['edge_liked_by'])) + yield from NodeIterator( + self._context, + '1cb6ec562846122743b61e492c85999f', + lambda d: d['data']['shortcode_media']['edge_liked_by'], + lambda n: Profile(self._context, n), + {'shortcode': self.shortcode}, + 'https://www.instagram.com/p/{0}/'.format(self.shortcode), + ) @property def is_sponsored(self) -> bool: @@ -770,80 +779,110 @@ class Profile: Use :attr:`profile_pic_url`.""" return self.profile_pic_url - def get_posts(self) -> Iterator[Post]: - """Retrieve all posts from a profile.""" - self._obtain_metadata() - yield from (Post(self._context, node, self) for node in - self._context.graphql_node_list("472f257a40c653c64c666ce877d59d2b", - {'id': self.userid}, - 'https://www.instagram.com/{0}/'.format(self.username), - lambda d: d['data']['user']['edge_owner_to_timeline_media'], - first_data=self._metadata('edge_owner_to_timeline_media'))) + def get_posts(self) -> NodeIterator[Post]: + """Retrieve all posts from a profile. 
-    def get_saved_posts(self) -> Iterator[Post]:
-        """Get Posts that are marked as saved by the user."""
+        :rtype: NodeIterator[Post]"""
+        self._obtain_metadata()
+        return NodeIterator(
+            self._context,
+            '472f257a40c653c64c666ce877d59d2b',
+            lambda d: d['data']['user']['edge_owner_to_timeline_media'],
+            lambda n: Post(self._context, n, self),
+            {'id': self.userid},
+            'https://www.instagram.com/{0}/'.format(self.username),
+            self._metadata('edge_owner_to_timeline_media'),
+        )
+
+    def get_saved_posts(self) -> NodeIterator[Post]:
+        """Get Posts that are marked as saved by the user.
+
+        :rtype: NodeIterator[Post]"""
         if self.username != self._context.username:
             raise LoginRequiredException("--login={} required to get that profile's saved posts.".format(self.username))
         self._obtain_metadata()
-        yield from (Post(self._context, node) for node in
-                    self._context.graphql_node_list("f883d95537fbcd400f466f63d42bd8a1",
-                                                    {'id': self.userid},
-                                                    'https://www.instagram.com/{0}/'.format(self.username),
-                                                    lambda d: d['data']['user']['edge_saved_media'],
-                                                    first_data=self._metadata('edge_saved_media')))
+        return NodeIterator(
+            self._context,
+            'f883d95537fbcd400f466f63d42bd8a1',
+            lambda d: d['data']['user']['edge_saved_media'],
+            lambda n: Post(self._context, n),
+            {'id': self.userid},
+            'https://www.instagram.com/{0}/'.format(self.username),
+            self._metadata('edge_saved_media'),
+        )
 
-    def get_tagged_posts(self) -> Iterator[Post]:
+    def get_tagged_posts(self) -> NodeIterator[Post]:
         """Retrieve all posts where a profile is tagged.
 
+        :rtype: NodeIterator[Post]
+
+        .. 
versionadded:: 4.0.7""" self._obtain_metadata() - yield from (Post(self._context, node, self if int(node['owner']['id']) == self.userid else None) for node in - self._context.graphql_node_list("e31a871f7301132ceaab56507a66bbb7", - {'id': self.userid}, - 'https://www.instagram.com/{0}/'.format(self.username), - lambda d: d['data']['user']['edge_user_to_photos_of_you'])) + return NodeIterator( + self._context, + 'e31a871f7301132ceaab56507a66bbb7', + lambda d: d['data']['user']['edge_user_to_photos_of_you'], + lambda n: Post(self._context, n, self if int(n['owner']['id']) == self.userid else None), + {'id': self.userid}, + 'https://www.instagram.com/{0}/'.format(self.username), + ) - def get_igtv_posts(self) -> Iterator[Post]: + def get_igtv_posts(self) -> NodeIterator[Post]: """Retrieve all IGTV posts. + :rtype: NodeIterator[Post] + .. versionadded:: 4.3""" self._obtain_metadata() - yield from (Post(self._context, node, self) for node in - self._context.graphql_node_list('bc78b344a68ed16dd5d7f264681c4c76', - {'id': self.userid}, - 'https://www.instagram.com/{0}/channel/'.format(self.username), - lambda d: d['data']['user']['edge_felix_video_timeline'], - first_data=self._metadata('edge_felix_video_timeline'))) + return NodeIterator( + self._context, + 'bc78b344a68ed16dd5d7f264681c4c76', + lambda d: d['data']['user']['edge_felix_video_timeline'], + lambda n: Post(self._context, n, self), + {'id': self.userid}, + 'https://www.instagram.com/{0}/channel/'.format(self.username), + self._metadata('edge_felix_video_timeline'), + ) - def get_followers(self) -> Iterator['Profile']: + def get_followers(self) -> NodeIterator['Profile']: """ Retrieve list of followers of given profile. To use this, one needs to be logged in and private profiles has to be followed. 
+
+        :rtype: NodeIterator[Profile]
         """
         if not self._context.is_logged_in:
             raise LoginRequiredException("--login required to get a profile's followers.")
         self._obtain_metadata()
-        yield from (Profile(self._context, node) for node in
-                    self._context.graphql_node_list("37479f2b8209594dde7facb0d904896a",
-                                                    {'id': str(self.userid)},
-                                                    'https://www.instagram.com/' + self.username + '/',
-                                                    lambda d: d['data']['user']['edge_followed_by']))
+        return NodeIterator(
+            self._context,
+            '37479f2b8209594dde7facb0d904896a',
+            lambda d: d['data']['user']['edge_followed_by'],
+            lambda n: Profile(self._context, n),
+            {'id': str(self.userid)},
+            'https://www.instagram.com/{0}/'.format(self.username),
+        )
 
-    def get_followees(self) -> Iterator['Profile']:
+    def get_followees(self) -> NodeIterator['Profile']:
         """
         Retrieve list of followees (followings) of given profile.
         To use this, one needs to be logged in and private profiles has to be followed.
+
+        :rtype: NodeIterator[Profile]
         """
         if not self._context.is_logged_in:
             raise LoginRequiredException("--login required to get a profile's followees.")
         self._obtain_metadata()
-        yield from (Profile(self._context, node) for node in
-                    self._context.graphql_node_list("58712303d941c6855d4e888c5f0cd22f",
-                                                    {'id': str(self.userid)},
-                                                    'https://www.instagram.com/' + self.username + '/',
-                                                    lambda d: d['data']['user']['edge_follow']))
+        return NodeIterator(
+            self._context,
+            '58712303d941c6855d4e888c5f0cd22f',
+            lambda d: d['data']['user']['edge_follow'],
+            lambda n: Profile(self._context, n),
+            {'id': str(self.userid)},
+            'https://www.instagram.com/{0}/'.format(self.username),
+        )
 
     def get_similar_accounts(self) -> Iterator['Profile']:
         """
@@ -1398,7 +1437,7 @@ class TopSearchResults:
         return self._searchstring
 
 
-JsonExportable = Union[Post, Profile, StoryItem, Hashtag]
+JsonExportable = Union[Post, Profile, StoryItem, Hashtag, FrozenNodeIterator]
 
 
 def save_structure_to_file(structure: JsonExportable, filename: str) -> None:
@@ -1447,6 +1486,8 @@ def 
load_structure_from_file(context: InstaloaderContext, filename: str) -> Json return StoryItem(context, json_structure['node']) elif node_type == "Hashtag": return Hashtag(context, json_structure['node']) + elif node_type == "FrozenNodeIterator": + return FrozenNodeIterator(**json_structure['node']) else: raise InvalidArgumentException("{}: Not an Instaloader JSON.".format(filename)) elif 'shortcode' in json_structure: