Resume a previously-aborted post download loop (#732)
With this change, Instaloader is capable of resuming a previously-aborted download loop. To do so, it creates a JSON file within the target directory when interrupted; that file contains all the information needed to later resume the operation. Resuming an interrupted download is supported for most, but not all, targets. It is supported for:

- regular profile posts,
- IGTV posts,
- saved posts,
- tagged posts,
- explore posts.
parent c817d1901a
commit bc40b82f94
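A minimal sketch of the resulting module-level workflow, interrupting and later resuming a download loop via the new :class:`NodeIterator` (the profile and file names are hypothetical)::

   import instaloader

   L = instaloader.Instaloader()
   post_iterator = instaloader.Profile.from_username(L.context, "someprofile").get_posts()
   try:
       for post in post_iterator:
           L.download_post(post, target="someprofile")
   except KeyboardInterrupt:
       # freeze() captures the iteration state; persist it for a later run
       instaloader.save_structure_to_file(post_iterator.freeze(), "resume.json.xz")

   # Later run: construct an equal iterator and thaw the saved state into it.
   post_iterator = instaloader.Profile.from_username(L.context, "someprofile").get_posts()
   post_iterator.thaw(instaloader.load_structure_from_file(L.context, "resume.json.xz"))
   for post in post_iterator:
       L.download_post(post, target="someprofile")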
@@ -49,7 +49,9 @@
   directory accordingly,
 
 - allows **fine-grained customization** of filters and where to store
-  downloaded media.
+  downloaded media,
+
+- automatically **resumes previously-interrupted** download iterations.
 
 ::
@@ -226,6 +226,19 @@ Exceptions
 
 .. autoexception:: TooManyRequestsException
 
+Resumable Iterations
+^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded:: 4.5
+
+.. autoclass:: NodeIterator
+   :no-show-inheritance:
+
+.. autoclass:: FrozenNodeIterator
+   :no-show-inheritance:
+
+.. autofunction:: resumable_iteration
+
 ``InstaloaderContext`` (Low-level functions)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -241,4 +254,4 @@ Exceptions
 .. autoclass:: RateController
    :no-show-inheritance:
 
-.. versionadded:: 4.5
+   .. versionadded:: 4.5
@@ -223,6 +223,29 @@ How to Download
    ``#hashtag`` or the profile name. Defaults to ``{date_utc}_UTC``.
    See :ref:`filename-specification` for a list of supported tokens.
 
+.. option:: --resume-prefix prefix
+
+   For many targets, Instaloader is capable of resuming a previously-aborted
+   download loop. To do so, it creates a JSON file within the target directory
+   when interrupted. This option controls the prefix for filenames that are
+   used to save the information to resume an interrupted download. The default
+   prefix is ``iterator``.
+
+   Resuming an interrupted download is supported for most, but not all targets.
+   JSON files with resume information are always compressed, regardless of
+   :option:`--no-compress-json`.
+
+   This feature is turned off entirely with :option:`--no-resume`.
+
+   .. versionadded:: 4.5
+
+.. option:: --no-resume
+
+   Do not resume a previously-aborted download iteration, and do not save such
+   information when interrupted.
+
+   .. versionadded:: 4.5
+
 .. option:: --user-agent USER_AGENT
 
    User Agent to use for HTTP requests. Per default, Instaloader pretends being
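As a sketch of the resulting file name (the magic value is hypothetical): ``posts_download_loop`` below combines the prefix with the iterator's magic string and a compressed-JSON suffix, so the default produces files like ``iterator_<magic>.json.xz`` inside the target directory::

   prefix, magic = "iterator", "Ab3dE_fG"                   # magic value hypothetical
   print("{0}_{1}.{2}".format(prefix, magic, "json.xz"))    # iterator_Ab3dE_fG.json.xz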
@@ -50,6 +50,8 @@ autodoc_member_order = 'bysource'
 intersphinx_mapping = {'python': ('https://docs.python.org/3', None),
                        'requests': ('https://requests.kennethreitz.org/en/master/', None)}
 
+nitpick_ignore = [('py:class', 'typing.Tuple')]
+
 current_release = subprocess.check_output(["git", "describe", "--abbrev=0"]).decode("ascii")[1:-1]
 date_format = "%e %b %Y" if platform.system() != "Windows" else "%d %b %Y"
 current_release_date = subprocess.check_output(
@@ -40,6 +40,8 @@ See :ref:`install` for more options on how to install Instaloader.
 - allows **fine-grained customization** of filters and where to store
   downloaded media,
 
+- automatically **resumes previously-interrupted** download iterations,
+
 - is free `open source <https://github.com/instaloader/instaloader>`__
   software written in Python.
 
@@ -15,5 +15,6 @@ else:
 from .exceptions import *
 from .instaloader import Instaloader
 from .instaloadercontext import InstaloaderContext, RateController
+from .nodeiterator import NodeIterator, FrozenNodeIterator, resumable_iteration
 from .structures import (Hashtag, Highlight, Post, PostSidecarNode, PostComment, PostCommentAnswer, PostLocation,
                          Profile, Story, StoryItem, TopSearchResults, load_structure_from_file, save_structure_to_file)
@@ -351,6 +351,13 @@ def main():
                            '--dirname-pattern. {profile} is replaced by the profile name,'
                            '{target} is replaced by the target you specified, i.e. either :feed'
                            '#hashtag or the profile name. Defaults to \'{date_utc}_UTC\'')
+    g_how.add_argument('--resume-prefix', metavar='PREFIX',
+                       help='Prefix for filenames that are used to save the information to resume an interrupted '
+                            'download.')
+    g_how.add_argument('--no-resume', action='store_true',
+                       help='Do not resume a previously-aborted download iteration, and do not save such information '
+                            'when interrupted.')
+    g_how.add_argument('--use-aged-resume-files', action='store_true', help=SUPPRESS)
     g_how.add_argument('--user-agent',
                        help='User Agent to use for HTTP requests. Defaults to \'{}\'.'.format(default_user_agent()))
     g_how.add_argument('-S', '--no-sleep', action='store_true', help=SUPPRESS)
@@ -394,6 +401,10 @@ def main():
         raise SystemExit("--no-captions and --post-metadata-txt or --storyitem-metadata-txt given; "
                          "That contradicts.")
 
+    if args.no_resume and args.resume_prefix:
+        raise SystemExit("--no-resume and --resume-prefix given; That contradicts.")
+    resume_prefix = (args.resume_prefix if args.resume_prefix else 'iterator') if not args.no_resume else None
+
     if args.no_pictures and args.fast_update:
         raise SystemExit('--no-pictures and --fast-update cannot be used together.')
 
@@ -412,7 +423,9 @@ def main():
                              post_metadata_txt_pattern=post_metadata_txt_pattern,
                              storyitem_metadata_txt_pattern=storyitem_metadata_txt_pattern,
                              max_connection_attempts=args.max_connection_attempts,
-                             request_timeout=args.request_timeout)
+                             request_timeout=args.request_timeout,
+                             resume_prefix=resume_prefix,
+                             check_resume_bbd=not args.use_aged_resume_files)
         _main(loader,
               args.profile,
               username=args.login.lower() if args.login is not None else None,
@@ -20,8 +20,9 @@ import urllib3  # type: ignore
 
 from .exceptions import *
 from .instaloadercontext import InstaloaderContext, RateController
+from .nodeiterator import NodeIterator, resumable_iteration
 from .structures import (Hashtag, Highlight, JsonExportable, Post, PostLocation, Profile, Story, StoryItem,
-                         save_structure_to_file)
+                         load_structure_from_file, save_structure_to_file)
 
 
 def get_default_session_filename(username: str) -> str:
@@ -154,6 +155,8 @@ class Instaloader:
     :param max_connection_attempts: :option:`--max-connection-attempts`
     :param request_timeout: :option:`--request-timeout`, set per-request timeout (seconds)
     :param rate_controller: Generator for a :class:`RateController` to override rate controlling behavior
+    :param resume_prefix: :option:`--resume-prefix`, or None for :option:`--no-resume`.
+    :param check_resume_bbd: Whether to check the date of expiry of resume files and reject them if expired.
 
     .. attribute:: context
 
@@ -177,7 +180,9 @@ class Instaloader:
                  storyitem_metadata_txt_pattern: str = None,
                  max_connection_attempts: int = 3,
                  request_timeout: Optional[float] = None,
-                 rate_controller: Optional[Callable[[InstaloaderContext], RateController]] = None):
+                 rate_controller: Optional[Callable[[InstaloaderContext], RateController]] = None,
+                 resume_prefix: Optional[str] = "iterator",
+                 check_resume_bbd: bool = True):
 
         self.context = InstaloaderContext(sleep, quiet, user_agent, max_connection_attempts,
                                           request_timeout, rate_controller)
@@ -196,6 +201,8 @@ class Instaloader:
             else post_metadata_txt_pattern
         self.storyitem_metadata_txt_pattern = '' if storyitem_metadata_txt_pattern is None \
             else storyitem_metadata_txt_pattern
+        self.resume_prefix = resume_prefix
+        self.check_resume_bbd = check_resume_bbd
 
     @contextmanager
     def anonymous_copy(self):
@@ -216,7 +223,9 @@ class Instaloader:
             post_metadata_txt_pattern=self.post_metadata_txt_pattern,
             storyitem_metadata_txt_pattern=self.storyitem_metadata_txt_pattern,
             max_connection_attempts=self.context.max_connection_attempts,
-            request_timeout=self.context.request_timeout)
+            request_timeout=self.context.request_timeout,
+            resume_prefix=self.resume_prefix,
+            check_resume_bbd=self.check_resume_bbd)
         yield new_loader
         self.context.error_log.extend(new_loader.context.error_log)
         new_loader.context.error_log = []  # avoid double-printing of errors
@@ -356,6 +365,24 @@ class Instaloader:
             os.utime(filename, (datetime.now().timestamp(), mtime.timestamp()))
         self.context.log('geo', end=' ', flush=True)
 
+    def format_filename_within_target_path(self,
+                                           target: Union[str, Path],
+                                           owner_profile: Optional[Profile],
+                                           identifier: str,
+                                           name_suffix: str,
+                                           extension: str):
+        """Returns a filename within the target path.
+
+        .. versionadded:: 4.5"""
+        if ((format_string_contains_key(self.dirname_pattern, 'profile') or
+             format_string_contains_key(self.dirname_pattern, 'target'))):
+            profile_str = owner_profile.username.lower() if owner_profile is not None else target
+            return os.path.join(self.dirname_pattern.format(profile=profile_str, target=target),
+                                '{0}_{1}.{2}'.format(identifier, name_suffix, extension))
+        else:
+            return os.path.join(self.dirname_pattern.format(),
+                                '{0}_{1}_{2}.{3}'.format(target, identifier, name_suffix, extension))
+
     @_retry_on_connection_error
     def download_title_pic(self, url: str, target: Union[str, Path], name_suffix: str, owner_profile: Optional[Profile],
                            _attempt: int = 1) -> None:
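A sketch of the two branches of this helper (directory patterns hypothetical): with ``{target}`` or ``{profile}`` in ``dirname_pattern``, the target is encoded in the directory; otherwise it is prepended to the file name::

   from instaloader import Instaloader

   L = Instaloader(dirname_pattern="{target}")
   print(L.format_filename_within_target_path("alice", None, "iterator", "magic123", "json.xz"))
   # alice/iterator_magic123.json.xz

   L = Instaloader(dirname_pattern="downloads")
   print(L.format_filename_within_target_path("alice", None, "iterator", "magic123", "json.xz"))
   # downloads/alice_iterator_magic123.json.xz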
@@ -376,16 +403,7 @@ class Instaloader:
         else:
             pic_bytes = http_response.content
         pic_identifier = md5(pic_bytes).hexdigest()[:16]
-        pic_extension = 'jpg'
-        if ((format_string_contains_key(self.dirname_pattern, 'profile') or
-             format_string_contains_key(self.dirname_pattern, 'target'))):
-            profile_str = owner_profile.username.lower() if owner_profile is not None else target
-            filename = os.path.join(self.dirname_pattern.format(profile=profile_str,
-                                                                target=target),
-                                    '{0}_{1}.{2}'.format(pic_identifier, name_suffix, pic_extension))
-        else:
-            filename = os.path.join(self.dirname_pattern.format(),
-                                    '{0}_{1}_{2}.{3}'.format(target, pic_identifier, name_suffix, pic_extension))
+        filename = self.format_filename_within_target_path(target, owner_profile, pic_identifier, name_suffix, 'jpg')
         content_length = http_response.headers.get('Content-Length', None)
         if os.path.isfile(filename) and (not self.context.is_logged_in or
                                          (content_length is not None and
@@ -705,59 +723,75 @@ class Instaloader:
                              fast_update: bool = False,
                              post_filter: Optional[Callable[[Post], bool]] = None,
                              max_count: Optional[int] = None,
-                             total_count: Optional[int] = None) -> None:
+                             total_count: Optional[int] = None,
+                             owner_profile: Optional[Profile] = None) -> None:
         """
         Download the Posts returned by given Post Iterator.
 
         .. versionadded:: 4.4
 
+        .. versionchanged:: 4.5
+           Transparently resume an aborted operation if `posts` is a :class:`NodeIterator`.
+
         :param posts: Post Iterator to loop through.
-        :param target: Target name
-        :param fast_update: :option:`--fast-update`
-        :param post_filter: :option:`--post-filter`
-        :param max_count: Maximum count of Posts to download (:option:`--count`)
-        :param total_count: Total number of posts returned by given iterator
+        :param target: Target name.
+        :param fast_update: :option:`--fast-update`.
+        :param post_filter: :option:`--post-filter`.
+        :param max_count: Maximum count of Posts to download (:option:`--count`).
+        :param total_count: Total number of posts returned by given iterator.
+        :param owner_profile: Associated profile, if any.
         """
-        for number, post in enumerate(posts):
-            if max_count is not None and number >= max_count:
-                break
-            if total_count is not None:
-                self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number + 1, total_count,
-                                                               w=len(str(total_count))),
-                                 end="", flush=True)
-            else:
-                if max_count is not None:
-                    self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number + 1, max_count,
-                                                                   w=len(str(max_count))),
-                                     end="", flush=True)
-                else:
-                    self.context.log("[{:3d}] ".format(number + 1), end="", flush=True)
-            if post_filter is not None:
-                try:
-                    if not post_filter(post):
-                        self.context.log("{} skipped".format(post))
-                        continue
-                except (InstaloaderException, KeyError, TypeError) as err:
-                    self.context.error("{} skipped. Filter evaluation failed: {}".format(post, err))
-                    continue
-            with self.context.error_catcher("Download {} of {}".format(post, target)):
-                # The PostChangedException gets raised if the Post's id/shortcode changed while obtaining
-                # additional metadata. This is most likely the case if a HTTP redirect takes place while
-                # resolving the shortcode URL.
-                # The `post_changed` variable keeps the fast-update functionality alive: A Post which is
-                # obtained after a redirect has probably already been downloaded as a previous Post of the
-                # same Profile.
-                # Observed in issue #225: https://github.com/instaloader/instaloader/issues/225
-                post_changed = False
-                while True:
-                    try:
-                        downloaded = self.download_post(post, target=target)
-                        break
-                    except PostChangedException:
-                        post_changed = True
-                        continue
-                if fast_update and not downloaded and not post_changed:
-                    break
+        displayed_count = (max_count if total_count is None or max_count is not None and max_count < total_count
+                           else total_count)
+        with resumable_iteration(
+                context=self.context,
+                iterator=posts,
+                load=load_structure_from_file,
+                save=save_structure_to_file,
+                format_path=lambda magic: self.format_filename_within_target_path(
+                    target, owner_profile, self.resume_prefix or '', magic, 'json.xz'
+                ),
+                check_bbd=self.check_resume_bbd,
+                enabled=self.resume_prefix is not None
+        ) as resume_info:
+            is_resuming, start_index = resume_info
+            for number, post in enumerate(posts):
+                if max_count is not None and number + start_index >= max_count:
+                    break
+                if displayed_count is not None:
+                    self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number + start_index + 1, displayed_count,
+                                                                   w=len(str(displayed_count))),
+                                     end="", flush=True)
+                else:
+                    self.context.log("[{:3d}] ".format(number + start_index + 1), end="", flush=True)
+                if post_filter is not None:
+                    try:
+                        if not post_filter(post):
+                            self.context.log("{} skipped".format(post))
+                            continue
+                    except (InstaloaderException, KeyError, TypeError) as err:
+                        self.context.error("{} skipped. Filter evaluation failed: {}".format(post, err))
+                        continue
+                with self.context.error_catcher("Download {} of {}".format(post, target)):
+                    # The PostChangedException gets raised if the Post's id/shortcode changed while obtaining
+                    # additional metadata. This is most likely the case if a HTTP redirect takes place while
+                    # resolving the shortcode URL.
+                    # The `post_changed` variable keeps the fast-update functionality alive: A Post which is
+                    # obtained after a redirect has probably already been downloaded as a previous Post of the
+                    # same Profile.
+                    # Observed in issue #225: https://github.com/instaloader/instaloader/issues/225
+                    post_changed = False
+                    while True:
+                        try:
+                            downloaded = self.download_post(post, target=target)
+                            break
+                        except PostChangedException:
+                            post_changed = True
+                            continue
+                    if fast_update and not downloaded and not post_changed:
+                        # disengage fast_update for first post when resuming
+                        if not is_resuming or number > 0:
+                            break
 
     @_requires_login
     def get_feed_posts(self) -> Iterator[Post]:
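A quick sketch of the resumed numbering (values hypothetical): after thawing, ``start_index`` equals the number of posts already handled, so the progress log picks up where it stopped::

   start_index, displayed_count = 120, 500   # hypothetical: 120 of 500 posts already done
   number = 0                                # first post yielded after resuming
   print("[{0:{w}d}/{1:{w}d}] ".format(number + start_index + 1, displayed_count,
                                       w=len(str(displayed_count))))
   # [121/500]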
@@ -817,8 +851,10 @@ class Instaloader:
         """
         self.context.log("Retrieving saved posts...")
         assert self.context.username is not None  # safe due to @_requires_login; required by typechecker
-        self.posts_download_loop(Profile.from_username(self.context, self.context.username).get_saved_posts(), ":saved",
-                                 fast_update, post_filter, max_count=max_count)
+        node_iterator = Profile.from_username(self.context, self.context.username).get_saved_posts()
+        self.posts_download_loop(node_iterator, ":saved",
+                                 fast_update, post_filter,
+                                 max_count=max_count, total_count=node_iterator.count)
 
     @_requires_login
     def get_location_posts(self, location: str) -> Iterator[Post]:
@@ -873,18 +909,20 @@ class Instaloader:
                                  max_count=max_count)
 
     @_requires_login
-    def get_explore_posts(self) -> Iterator[Post]:
+    def get_explore_posts(self) -> NodeIterator[Post]:
         """Get Posts which are worthy of exploring suggested by Instagram.
 
         :return: Iterator over Posts of the user's suggested posts.
+        :rtype: NodeIterator[Post]
         :raises LoginRequiredException: If called without being logged in.
         """
-        data = self.context.get_json('explore/', {})
-        yield from (Post(self.context, node)
-                    for node in self.context.graphql_node_list("df0dcc250c2b18d9fd27c5581ef33c7c",
-                                                               {}, 'https://www.instagram.com/explore/',
-                                                               lambda d: d['data']['user']['edge_web_discover_media'],
-                                                               data.get('rhx_gis')))
+        return NodeIterator(
+            self.context,
+            'df0dcc250c2b18d9fd27c5581ef33c7c',
+            lambda d: d['data']['user']['edge_web_discover_media'],
+            lambda n: Post(self.context, n),
+            query_referer='https://www.instagram.com/explore/',
+        )
 
     def get_hashtag_posts(self, hashtag: str) -> Iterator[Post]:
         """Get Posts associated with a #hashtag.
@@ -955,7 +993,7 @@ class Instaloader:
         .. versionadded:: 4.3"""
         self.context.log("Retrieving IGTV videos for profile {}.".format(profile.username))
         self.posts_download_loop(profile.get_igtv_posts(), profile.username, fast_update, post_filter,
-                                 total_count=profile.igtvcount)
+                                 total_count=profile.igtvcount, owner_profile=profile)
 
     def _get_id_filename(self, profile_name: str) -> str:
         if ((format_string_contains_key(self.dirname_pattern, 'profile') or
@@ -1110,7 +1148,7 @@ class Instaloader:
         if posts:
             self.context.log("Retrieving posts from profile {}.".format(profile_name))
             self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter,
-                                     total_count=profile.mediacount)
+                                     total_count=profile.mediacount, owner_profile=profile)
 
         if stories and profiles:
             with self.context.error_catcher("Download stories"):
@@ -1190,7 +1228,7 @@ class Instaloader:
         # Iterate over pictures and download them
         self.context.log("Retrieving posts from profile {}.".format(profile_name))
         self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter,
-                                 total_count=profile.mediacount)
+                                 total_count=profile.mediacount, owner_profile=profile)
 
     def interactive_login(self, username: str) -> None:
         """Logs in and internally stores session, asking user for password interactively.
@@ -428,7 +428,12 @@ class InstaloaderContext:
                           edge_extractor: Callable[[Dict[str, Any]], Dict[str, Any]],
                           rhx_gis: Optional[str] = None,
                           first_data: Optional[Dict[str, Any]] = None) -> Iterator[Dict[str, Any]]:
-        """Retrieve a list of GraphQL nodes."""
+        """
+        Retrieve a list of GraphQL nodes.
+
+        .. deprecated:: 4.5
+           Use :class:`NodeIterator` instead, which provides more functionality.
+        """
 
         def _query():
             query_variables['first'] = self._graphql_page_length
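For illustration, a sketch of the migration pattern applied throughout ``structures.py`` in this commit (the function and its parameters below are placeholders): the same query hash, edge extractor, variables and referer carry over, plus a wrapper that maps each raw node dict to an object::

   from instaloader import NodeIterator, Post

   def get_nodes(context, query_hash, query_variables, referer, edge_extractor):
       # Deprecated: context.graphql_node_list(query_hash, query_variables, referer, edge_extractor)
       # Replacement: same parameters, plus a node wrapper, reordered for NodeIterator.
       return NodeIterator(context, query_hash, edge_extractor,
                           lambda n: Post(context, n),
                           query_variables, referer)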
instaloader/nodeiterator.py (new file, 261 lines)
@@ -0,0 +1,261 @@
import base64
import hashlib
import json
import os
from contextlib import contextmanager
from datetime import datetime, timedelta
from lzma import LZMAError
from typing import Any, Callable, Dict, Iterator, NamedTuple, Optional, Tuple, TypeVar

from .exceptions import InvalidArgumentException, QueryReturnedBadRequestException
from .instaloadercontext import InstaloaderContext

FrozenNodeIterator = NamedTuple('FrozenNodeIterator',
                                [('query_hash', str),
                                 ('query_variables', Dict),
                                 ('query_referer', Optional[str]),
                                 ('context_username', Optional[str]),
                                 ('total_index', int),
                                 ('best_before', Optional[float]),
                                 ('remaining_data', Optional[Dict])])
FrozenNodeIterator.__doc__ = \
    """A serializable representation of a :class:`NodeIterator` instance, saving its iteration state."""
FrozenNodeIterator.query_hash.__doc__ = """The GraphQL ``query_hash`` parameter."""
FrozenNodeIterator.query_variables.__doc__ = """The GraphQL ``query_variables`` parameter."""
FrozenNodeIterator.query_referer.__doc__ = """The HTTP referer used for the GraphQL query."""
FrozenNodeIterator.context_username.__doc__ = """The username who created the iterator, or ``None``."""
FrozenNodeIterator.total_index.__doc__ = """Number of items that have already been returned."""
FrozenNodeIterator.best_before.__doc__ = """Date when parts of the stored nodes might have expired."""
FrozenNodeIterator.remaining_data.__doc__ = \
    """The already-retrieved, yet-unprocessed ``edges`` and the ``page_info`` at time of freezing."""


T = TypeVar('T')


class NodeIterator(Iterator[T]):
    """
    Iterate the nodes within edges in a GraphQL pagination. Instances of this class are returned by many (but not all)
    of Instaloader's :class:`Post`-returning functions (such as :meth:`Profile.get_posts` etc.).

    What makes this iterator special is its ability to freeze/store its current state, e.g. to interrupt an iteration,
    and later thaw/resume from where it left off.

    You can freeze a NodeIterator with :meth:`NodeIterator.freeze`::

       post_iterator = profile.get_posts()
       try:
           for post in post_iterator:
               do_something_with(post)
       except KeyboardInterrupt:
           save("resume_information.json", post_iterator.freeze())

    and later reuse it with :meth:`NodeIterator.thaw` on an equally-constructed NodeIterator::

       post_iterator = profile.get_posts()
       post_iterator.thaw(load("resume_information.json"))

    A :class:`FrozenNodeIterator` can only be thawed with a matching NodeIterator, i.e. a NodeIterator instance that
    has been constructed with the same parameters as the instance that is represented by the
    :class:`FrozenNodeIterator` in question. This is to ensure that an iteration cannot be resumed in a wrong,
    unmatching loop. As a quick way to distinguish iterators that are saved e.g. in files, there is the
    :attr:`NodeIterator.magic` string: Two NodeIterators are matching if they have the same magic.

    See also :func:`resumable_iteration` for a high-level context manager that handles a resumable iteration.
    """

    _graphql_page_length = 50
    shelf_life = timedelta(days=29)

    def __init__(self,
                 context: InstaloaderContext,
                 query_hash: str,
                 edge_extractor: Callable[[Dict[str, Any]], Dict[str, Any]],
                 node_wrapper: Callable[[Dict], T],
                 query_variables: Optional[Dict[str, Any]] = None,
                 query_referer: Optional[str] = None,
                 first_data: Optional[Dict[str, Any]] = None):
        self._context = context
        self._query_hash = query_hash
        self._edge_extractor = edge_extractor
        self._node_wrapper = node_wrapper
        self._query_variables = query_variables if query_variables is not None else {}
        self._query_referer = query_referer
        self._data = first_data
        self._page_index = 0
        self._total_index = 0
        self._best_before = (None if first_data is None else
                             datetime.now() + NodeIterator.shelf_life)

    def _query(self, after: Optional[str] = None) -> Dict:
        pagination_variables = {'first': NodeIterator._graphql_page_length}  # type: Dict[str, Any]
        if after is not None:
            pagination_variables['after'] = after
        try:
            data = self._edge_extractor(
                self._context.graphql_query(
                    self._query_hash, {**self._query_variables, **pagination_variables}, self._query_referer
                )
            )
            self._best_before = datetime.now() + NodeIterator.shelf_life
            return data
        except QueryReturnedBadRequestException:
            new_page_length = int(NodeIterator._graphql_page_length / 2)
            if new_page_length >= 12:
                NodeIterator._graphql_page_length = new_page_length
                self._context.error("HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.",
                                    repeat_at_end=False)
                return self._query(after)
            else:
                raise

    def __iter__(self):
        return self

    def __next__(self):
        if self._data is None:
            self._data = self._query()
        if self._page_index < len(self._data['edges']):
            node = self._data['edges'][self._page_index]['node']
            page_index, total_index = self._page_index, self._total_index
            try:
                self._page_index += 1
                self._total_index += 1
            except KeyboardInterrupt:
                self._page_index, self._total_index = page_index, total_index
                raise
            return self._node_wrapper(node)
        if self._data['page_info']['has_next_page']:
            query_response = self._query(self._data['page_info']['end_cursor'])
            page_index, data = self._page_index, self._data
            try:
                self._page_index = 0
                self._data = query_response
            except KeyboardInterrupt:
                self._page_index, self._data = page_index, data
                raise
            return self.__next__()
        raise StopIteration()

    @property
    def count(self) -> Optional[int]:
        """The ``count`` as returned by Instagram. This is not always the total count this iterator will yield."""
        return self._data.get('count') if self._data is not None else None

    @property
    def total_index(self) -> int:
        """Number of items that have already been returned."""
        return self._total_index

    @property
    def magic(self) -> str:
        """Magic string for easily identifying a matching iterator file for resuming (hash of some parameters)."""
        if 'blake2b' not in hashlib.algorithms_available:
            magic_hash = hashlib.new('sha224')
        else:
            # Use blake2b when possible, i.e. on Python >= 3.6.
            magic_hash = hashlib.blake2b(digest_size=6)  # type: ignore  # pylint: disable=no-member
        magic_hash.update(json.dumps(
            [self._query_hash, self._query_variables, self._query_referer, self._context.username]
        ).encode())
        return base64.urlsafe_b64encode(magic_hash.digest()).decode()

    def freeze(self) -> FrozenNodeIterator:
        """Freeze the iterator for later resuming."""
        remaining_data = None
        if self._data is not None:
            remaining_data = {**self._data,
                              'edges': (self._data['edges'][(max(self._page_index - 1, 0)):])}
        return FrozenNodeIterator(
            query_hash=self._query_hash,
            query_variables=self._query_variables,
            query_referer=self._query_referer,
            context_username=self._context.username,
            total_index=max(self.total_index - 1, 0),
            best_before=self._best_before.timestamp() if self._best_before else None,
            remaining_data=remaining_data,
        )

    def thaw(self, frozen: FrozenNodeIterator) -> None:
        """Use this iterator for resuming from earlier iteration."""
        if self._total_index or self._page_index:
            raise InvalidArgumentException("thaw() called on already-used iterator.")
        if (self._query_hash != frozen.query_hash or
                self._query_variables != frozen.query_variables or
                self._query_referer != frozen.query_referer or
                self._context.username != frozen.context_username):
            raise InvalidArgumentException("Mismatching resume information.")
        self._total_index = frozen.total_index
        self._best_before = datetime.fromtimestamp(frozen.best_before) if frozen.best_before else None
        self._data = frozen.remaining_data


@contextmanager
def resumable_iteration(context: InstaloaderContext,
                        iterator: Iterator,
                        load: Callable[[InstaloaderContext, str], Any],
                        save: Callable[[FrozenNodeIterator, str], None],
                        format_path: Callable[[str], str],
                        check_bbd: bool = True,
                        enabled: bool = True) -> Iterator[Tuple[bool, int]]:
    """
    High-level context manager to handle a resumable iteration that can be interrupted with a KeyboardInterrupt.

    It can be used as follows to automatically load a previously-saved state into the iterator, save the iterator's
    state when interrupted, and delete the resume file upon completion::

       post_iterator = profile.get_posts()
       with resumable_iteration(
               context=L.context,
               iterator=post_iterator,
               load=lambda _, path: FrozenNodeIterator(**json.load(open(path))),
               save=lambda fni, path: json.dump(fni._asdict(), open(path, 'w')),
               format_path=lambda magic: "resume_info_{}.json".format(magic)
       ) as resume_info:
           is_resuming, start_index = resume_info
           for post in post_iterator:
               do_something_with(post)

    It yields a tuple (is_resuming, start_index).

    When the passed iterator is not a :class:`NodeIterator`, it behaves as if ``resumable_iteration`` was not used,
    just executing the inner body.

    :param context: The :class:`InstaloaderContext`.
    :param iterator: The fresh :class:`NodeIterator`.
    :param load: Loads a FrozenNodeIterator from given path. The object is ignored if it has a different type.
    :param save: Saves the given FrozenNodeIterator to the given path.
    :param format_path: Returns the path to the resume file for the given magic.
    :param check_bbd: Whether to check the best before date and reject an expired FrozenNodeIterator.
    :param enabled: Set to False to disable all functionality and simply execute the inner body.
    """
    if not enabled or not isinstance(iterator, NodeIterator):
        yield False, 0
        return
    is_resuming = False
    start_index = 0
    resume_file_path = format_path(iterator.magic)
    resume_file_exists = os.path.isfile(resume_file_path)
    if resume_file_exists:
        try:
            fni = load(context, resume_file_path)
            if not isinstance(fni, FrozenNodeIterator):
                raise InvalidArgumentException("Invalid type.")
            if check_bbd and fni.best_before and datetime.fromtimestamp(fni.best_before) < datetime.now():
                raise InvalidArgumentException("\"Best before\" date exceeded.")
            iterator.thaw(fni)
            is_resuming = True
            start_index = iterator.total_index
            context.log("Resuming from {}.".format(resume_file_path))
        except (InvalidArgumentException, LZMAError, json.decoder.JSONDecodeError) as exc:
            context.error("Warning: Not resuming from {}: {}".format(resume_file_path, exc))
    try:
        yield is_resuming, start_index
    except KeyboardInterrupt:
        os.makedirs(os.path.dirname(resume_file_path), exist_ok=True)
        save(iterator.freeze(), resume_file_path)
        context.log("\nSaved resume information to {}.".format(resume_file_path))
        raise
    if resume_file_exists:
        os.unlink(resume_file_path)
        context.log("Iteration complete, deleted resume information file {}.".format(resume_file_path))
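A minimal sketch of how these pieces fit together, mirroring the wiring in ``posts_download_loop`` above with the ``structures`` helpers as ``load``/``save`` callbacks (profile and file names hypothetical)::

   from instaloader import (Instaloader, Profile, load_structure_from_file,
                            save_structure_to_file, resumable_iteration)

   L = Instaloader()
   post_iterator = Profile.from_username(L.context, "someprofile").get_posts()
   with resumable_iteration(
           context=L.context,
           iterator=post_iterator,
           load=load_structure_from_file,
           save=save_structure_to_file,
           format_path=lambda magic: "resume_{}.json.xz".format(magic)
   ) as resume_info:
       is_resuming, start_index = resume_info
       for post in post_iterator:
           L.download_post(post, target="someprofile")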
@@ -9,7 +9,7 @@ from typing import Any, Dict, Iterator, List, Optional, Union
 from . import __version__
 from .exceptions import *
 from .instaloadercontext import InstaloaderContext
-
+from .nodeiterator import FrozenNodeIterator, NodeIterator
 
 PostSidecarNode = namedtuple('PostSidecarNode', ['is_video', 'display_url', 'video_url'])
 PostSidecarNode.__doc__ = "Item of a Sidecar Post."
@@ -402,11 +402,14 @@ class Post:
                 # If the answer's metadata already contains all comments, don't do GraphQL requests to obtain them
                 yield from (_postcommentanswer(comment['node']) for comment in answer_edges)
                 return
-            yield from (_postcommentanswer(answer_node) for answer_node in
-                        self._context.graphql_node_list("51fdd02b67508306ad4484ff574a0b62",
-                                                        {'comment_id': node['id']},
-                                                        'https://www.instagram.com/p/' + self.shortcode + '/',
-                                                        lambda d: d['data']['comment']['edge_threaded_comments']))
+            yield from NodeIterator(
+                self._context,
+                '51fdd02b67508306ad4484ff574a0b62',
+                lambda d: d['data']['comment']['edge_threaded_comments'],
+                _postcommentanswer,
+                {'comment_id': node['id']},
+                'https://www.instagram.com/p/{0}/'.format(self.shortcode),
+            )
 
         def _postcomment(node):
             return PostComment(*_postcommentanswer(node),
@@ -422,12 +425,14 @@ class Post:
             # If the Post's metadata already contains all parent comments, don't do GraphQL requests to obtain them
             yield from (_postcomment(comment['node']) for comment in comment_edges)
             return
-        yield from (_postcomment(node) for node in
-                    self._context.graphql_node_list(
-                        "97b41c52301f77ce508f55e66d17620e",
-                        {'shortcode': self.shortcode},
-                        'https://www.instagram.com/p/' + self.shortcode + '/',
-                        lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment']))
+        yield from NodeIterator(
+            self._context,
+            '97b41c52301f77ce508f55e66d17620e',
+            lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment'],
+            _postcomment,
+            {'shortcode': self.shortcode},
+            'https://www.instagram.com/p/{0}/'.format(self.shortcode),
+        )
 
     def get_likes(self) -> Iterator['Profile']:
         """Iterate over all likes of the post. A :class:`Profile` instance of each likee is yielded."""
@@ -439,10 +444,14 @@ class Post:
             # If the Post's metadata already contains all likes, don't do GraphQL requests to obtain them
             yield from (Profile(self._context, like['node']) for like in likes_edges)
             return
-        yield from (Profile(self._context, node) for node in
-                    self._context.graphql_node_list("1cb6ec562846122743b61e492c85999f", {'shortcode': self.shortcode},
-                                                    'https://www.instagram.com/p/' + self.shortcode + '/',
-                                                    lambda d: d['data']['shortcode_media']['edge_liked_by']))
+        yield from NodeIterator(
+            self._context,
+            '1cb6ec562846122743b61e492c85999f',
+            lambda d: d['data']['shortcode_media']['edge_liked_by'],
+            lambda n: Profile(self._context, n),
+            {'shortcode': self.shortcode},
+            'https://www.instagram.com/p/{0}/'.format(self.shortcode),
+        )
 
     @property
     def is_sponsored(self) -> bool:
@@ -770,80 +779,110 @@ class Profile:
            Use :attr:`profile_pic_url`."""
         return self.profile_pic_url
 
-    def get_posts(self) -> Iterator[Post]:
-        """Retrieve all posts from a profile."""
-        self._obtain_metadata()
-        yield from (Post(self._context, node, self) for node in
-                    self._context.graphql_node_list("472f257a40c653c64c666ce877d59d2b",
-                                                    {'id': self.userid},
-                                                    'https://www.instagram.com/{0}/'.format(self.username),
-                                                    lambda d: d['data']['user']['edge_owner_to_timeline_media'],
-                                                    first_data=self._metadata('edge_owner_to_timeline_media')))
+    def get_posts(self) -> NodeIterator[Post]:
+        """Retrieve all posts from a profile.
+
+        :rtype: NodeIterator[Post]"""
+        self._obtain_metadata()
+        return NodeIterator(
+            self._context,
+            '472f257a40c653c64c666ce877d59d2b',
+            lambda d: d['data']['user']['edge_owner_to_timeline_media'],
+            lambda n: Post(self._context, n, self),
+            {'id': self.userid},
+            'https://www.instagram.com/{0}/'.format(self.username),
+            self._metadata('edge_owner_to_timeline_media'),
+        )
 
-    def get_saved_posts(self) -> Iterator[Post]:
-        """Get Posts that are marked as saved by the user."""
+    def get_saved_posts(self) -> NodeIterator[Post]:
+        """Get Posts that are marked as saved by the user.
+
+        :rtype: NodeIterator[Post]"""
 
         if self.username != self._context.username:
             raise LoginRequiredException("--login={} required to get that profile's saved posts.".format(self.username))
 
         self._obtain_metadata()
-        yield from (Post(self._context, node) for node in
-                    self._context.graphql_node_list("f883d95537fbcd400f466f63d42bd8a1",
-                                                    {'id': self.userid},
-                                                    'https://www.instagram.com/{0}/'.format(self.username),
-                                                    lambda d: d['data']['user']['edge_saved_media'],
-                                                    first_data=self._metadata('edge_saved_media')))
+        return NodeIterator(
+            self._context,
+            'f883d95537fbcd400f466f63d42bd8a1',
+            lambda d: d['data']['user']['edge_saved_media'],
+            lambda n: Post(self._context, n),
+            {'id': self.userid},
+            'https://www.instagram.com/{0}/'.format(self.username),
+            self._metadata('edge_saved_media'),
+        )
 
-    def get_tagged_posts(self) -> Iterator[Post]:
+    def get_tagged_posts(self) -> NodeIterator[Post]:
         """Retrieve all posts where a profile is tagged.
 
+        :rtype: NodeIterator[Post]
+
         .. versionadded:: 4.0.7"""
         self._obtain_metadata()
-        yield from (Post(self._context, node, self if int(node['owner']['id']) == self.userid else None) for node in
-                    self._context.graphql_node_list("e31a871f7301132ceaab56507a66bbb7",
-                                                    {'id': self.userid},
-                                                    'https://www.instagram.com/{0}/'.format(self.username),
-                                                    lambda d: d['data']['user']['edge_user_to_photos_of_you']))
+        return NodeIterator(
+            self._context,
+            'e31a871f7301132ceaab56507a66bbb7',
+            lambda d: d['data']['user']['edge_user_to_photos_of_you'],
+            lambda n: Post(self._context, n, self if int(n['owner']['id']) == self.userid else None),
+            {'id': self.userid},
+            'https://www.instagram.com/{0}/'.format(self.username),
+        )
 
-    def get_igtv_posts(self) -> Iterator[Post]:
+    def get_igtv_posts(self) -> NodeIterator[Post]:
         """Retrieve all IGTV posts.
 
+        :rtype: NodeIterator[Post]
+
         .. versionadded:: 4.3"""
         self._obtain_metadata()
-        yield from (Post(self._context, node, self) for node in
-                    self._context.graphql_node_list('bc78b344a68ed16dd5d7f264681c4c76',
-                                                    {'id': self.userid},
-                                                    'https://www.instagram.com/{0}/channel/'.format(self.username),
-                                                    lambda d: d['data']['user']['edge_felix_video_timeline'],
-                                                    first_data=self._metadata('edge_felix_video_timeline')))
+        return NodeIterator(
+            self._context,
+            'bc78b344a68ed16dd5d7f264681c4c76',
+            lambda d: d['data']['user']['edge_felix_video_timeline'],
+            lambda n: Post(self._context, n, self),
+            {'id': self.userid},
+            'https://www.instagram.com/{0}/channel/'.format(self.username),
+            self._metadata('edge_felix_video_timeline'),
+        )
 
-    def get_followers(self) -> Iterator['Profile']:
+    def get_followers(self) -> NodeIterator['Profile']:
         """
         Retrieve list of followers of given profile.
         To use this, one needs to be logged in and private profiles have to be followed.
+
+        :rtype: NodeIterator[Profile]
         """
         if not self._context.is_logged_in:
             raise LoginRequiredException("--login required to get a profile's followers.")
         self._obtain_metadata()
-        yield from (Profile(self._context, node) for node in
-                    self._context.graphql_node_list("37479f2b8209594dde7facb0d904896a",
-                                                    {'id': str(self.userid)},
-                                                    'https://www.instagram.com/' + self.username + '/',
-                                                    lambda d: d['data']['user']['edge_followed_by']))
+        return NodeIterator(
+            self._context,
+            '37479f2b8209594dde7facb0d904896a',
+            lambda d: d['data']['user']['edge_followed_by'],
+            lambda n: Profile(self._context, n),
+            {'id': str(self.userid)},
+            'https://www.instagram.com/{0}/'.format(self.username),
+        )
 
-    def get_followees(self) -> Iterator['Profile']:
+    def get_followees(self) -> NodeIterator['Profile']:
         """
         Retrieve list of followees (followings) of given profile.
         To use this, one needs to be logged in and private profiles have to be followed.
+
+        :rtype: NodeIterator[Profile]
         """
         if not self._context.is_logged_in:
             raise LoginRequiredException("--login required to get a profile's followees.")
         self._obtain_metadata()
-        yield from (Profile(self._context, node) for node in
-                    self._context.graphql_node_list("58712303d941c6855d4e888c5f0cd22f",
-                                                    {'id': str(self.userid)},
-                                                    'https://www.instagram.com/' + self.username + '/',
-                                                    lambda d: d['data']['user']['edge_follow']))
+        return NodeIterator(
+            self._context,
+            '58712303d941c6855d4e888c5f0cd22f',
+            lambda d: d['data']['user']['edge_follow'],
+            lambda n: Profile(self._context, n),
+            {'id': str(self.userid)},
+            'https://www.instagram.com/{0}/'.format(self.username),
+        )
 
     def get_similar_accounts(self) -> Iterator['Profile']:
         """
@@ -1398,7 +1437,7 @@ class TopSearchResults:
         return self._searchstring
 
 
-JsonExportable = Union[Post, Profile, StoryItem, Hashtag]
+JsonExportable = Union[Post, Profile, StoryItem, Hashtag, FrozenNodeIterator]
 
 
 def save_structure_to_file(structure: JsonExportable, filename: str) -> None:
@@ -1447,6 +1486,8 @@ def load_structure_from_file(context: InstaloaderContext, filename: str) -> JsonExportable:
             return StoryItem(context, json_structure['node'])
         elif node_type == "Hashtag":
             return Hashtag(context, json_structure['node'])
+        elif node_type == "FrozenNodeIterator":
+            return FrozenNodeIterator(**json_structure['node'])
         else:
            raise InvalidArgumentException("{}: Not an Instaloader JSON.".format(filename))
     elif 'shortcode' in json_structure:
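A sketch of the resulting round trip (profile and file names hypothetical): a frozen iterator is persisted with the same helpers as Posts and Profiles, and the ``.xz`` suffix keeps it compressed::

   from instaloader import (FrozenNodeIterator, Instaloader, Profile,
                            load_structure_from_file, save_structure_to_file)

   L = Instaloader()
   post_iterator = Profile.from_username(L.context, "someprofile").get_posts()
   save_structure_to_file(post_iterator.freeze(), "state.json.xz")

   restored = load_structure_from_file(L.context, "state.json.xz")
   assert isinstance(restored, FrozenNodeIterator)
   fresh_iterator = Profile.from_username(L.context, "someprofile").get_posts()
   fresh_iterator.thaw(restored)   # thaw() requires a fresh, matching iterator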