Fix interrupted downloads with --latest-stamps (#1219)

The most recent post is cached in NodeIterator (and
saved to disk), and its timestamp is used, instead of the
time at which instaloader was run.

This way, even in later resumed runs, the stored timestamp is the same
one that would have been stored if the first run had completed.

Fixes #1206.
This commit is contained in:
Eduardo Kalinowski 2021-07-24 14:27:46 -03:00 committed by GitHub
parent 26c9f71132
commit 9cdf679fc1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 35 additions and 15 deletions

View File

@ -1203,14 +1203,14 @@ class Instaloader:
if latest_stamps is not None: if latest_stamps is not None:
last_scraped = latest_stamps.get_last_tagged_timestamp(profile.username) last_scraped = latest_stamps.get_last_tagged_timestamp(profile.username)
posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
scraped_timestamp = datetime.now().astimezone() tagged_posts = profile.get_tagged_posts()
self.posts_download_loop(profile.get_tagged_posts(), self.posts_download_loop(tagged_posts,
target if target target if target
else (Path(_PostPathFormatter.sanitize_path(profile.username)) / else (Path(_PostPathFormatter.sanitize_path(profile.username)) /
_PostPathFormatter.sanitize_path(':tagged')), _PostPathFormatter.sanitize_path(':tagged')),
fast_update, post_filter, takewhile=posts_takewhile) fast_update, post_filter, takewhile=posts_takewhile)
if latest_stamps is not None: if latest_stamps is not None and tagged_posts.first_item is not None:
latest_stamps.set_last_tagged_timestamp(profile.username, scraped_timestamp) latest_stamps.set_last_tagged_timestamp(profile.username, tagged_posts.first_item.date_local.astimezone())
def download_igtv(self, profile: Profile, fast_update: bool = False, def download_igtv(self, profile: Profile, fast_update: bool = False,
post_filter: Optional[Callable[[Post], bool]] = None, post_filter: Optional[Callable[[Post], bool]] = None,
@ -1226,11 +1226,11 @@ class Instaloader:
if latest_stamps is not None: if latest_stamps is not None:
last_scraped = latest_stamps.get_last_igtv_timestamp(profile.username) last_scraped = latest_stamps.get_last_igtv_timestamp(profile.username)
posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
scraped_timestamp = datetime.now().astimezone() igtv_posts = profile.get_igtv_posts()
self.posts_download_loop(profile.get_igtv_posts(), profile.username, fast_update, post_filter, self.posts_download_loop(igtv_posts, profile.username, fast_update, post_filter,
total_count=profile.igtvcount, owner_profile=profile, takewhile=posts_takewhile) total_count=profile.igtvcount, owner_profile=profile, takewhile=posts_takewhile)
if latest_stamps is not None: if latest_stamps is not None and igtv_posts.first_item is not None:
latest_stamps.set_last_igtv_timestamp(profile.username, scraped_timestamp) latest_stamps.set_last_igtv_timestamp(profile.username, igtv_posts.first_item.date_local.astimezone())
def _get_id_filename(self, profile_name: str) -> str: def _get_id_filename(self, profile_name: str) -> str:
if ((format_string_contains_key(self.dirname_pattern, 'profile') or if ((format_string_contains_key(self.dirname_pattern, 'profile') or
@ -1424,12 +1424,13 @@ class Instaloader:
# pylint:disable=cell-var-from-loop # pylint:disable=cell-var-from-loop
last_scraped = latest_stamps.get_last_post_timestamp(profile_name) last_scraped = latest_stamps.get_last_post_timestamp(profile_name)
posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
scraped_timestamp = datetime.now().astimezone() posts_to_download = profile.get_posts()
self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter, self.posts_download_loop(posts_to_download, profile_name, fast_update, post_filter,
total_count=profile.mediacount, owner_profile=profile, total_count=profile.mediacount, owner_profile=profile,
takewhile=posts_takewhile) takewhile=posts_takewhile)
if latest_stamps is not None: if latest_stamps is not None and posts_to_download.first_item is not None:
latest_stamps.set_last_post_timestamp(profile_name, scraped_timestamp) latest_stamps.set_last_post_timestamp(profile_name,
posts_to_download.first_item.date_local.astimezone())
if stories and profiles: if stories and profiles:
with self.context.error_catcher("Download stories"): with self.context.error_catcher("Download stories"):

View File

@ -17,7 +17,8 @@ FrozenNodeIterator = NamedTuple('FrozenNodeIterator',
('context_username', Optional[str]), ('context_username', Optional[str]),
('total_index', int), ('total_index', int),
('best_before', Optional[float]), ('best_before', Optional[float]),
('remaining_data', Optional[Dict])]) ('remaining_data', Optional[Dict]),
('first_node', Optional[Dict])])
FrozenNodeIterator.query_hash.__doc__ = """The GraphQL ``query_hash`` parameter.""" FrozenNodeIterator.query_hash.__doc__ = """The GraphQL ``query_hash`` parameter."""
FrozenNodeIterator.query_variables.__doc__ = """The GraphQL ``query_variables`` parameter.""" FrozenNodeIterator.query_variables.__doc__ = """The GraphQL ``query_variables`` parameter."""
FrozenNodeIterator.query_referer.__doc__ = """The HTTP referer used for the GraphQL query.""" FrozenNodeIterator.query_referer.__doc__ = """The HTTP referer used for the GraphQL query."""
@ -26,7 +27,7 @@ FrozenNodeIterator.total_index.__doc__ = """Number of items that have already be
FrozenNodeIterator.best_before.__doc__ = """Date when parts of the stored nodes might have expired.""" FrozenNodeIterator.best_before.__doc__ = """Date when parts of the stored nodes might have expired."""
FrozenNodeIterator.remaining_data.__doc__ = \ FrozenNodeIterator.remaining_data.__doc__ = \
"""The already-retrieved, yet-unprocessed ``edges`` and the ``page_info`` at time of freezing.""" """The already-retrieved, yet-unprocessed ``edges`` and the ``page_info`` at time of freezing."""
FrozenNodeIterator.first_node.__doc__ = """Node data of the first item, if an item has been produced."""
T = TypeVar('T') T = TypeVar('T')
@ -89,6 +90,7 @@ class NodeIterator(Iterator[T]):
self._best_before = datetime.now() + NodeIterator._shelf_life self._best_before = datetime.now() + NodeIterator._shelf_life
else: else:
self._data = self._query() self._data = self._query()
self._first_node: Optional[Dict] = None
def _query(self, after: Optional[str] = None) -> Dict: def _query(self, after: Optional[str] = None) -> Dict:
pagination_variables = {'first': NodeIterator._graphql_page_length} # type: Dict[str, Any] pagination_variables = {'first': NodeIterator._graphql_page_length} # type: Dict[str, Any]
@ -125,7 +127,10 @@ class NodeIterator(Iterator[T]):
except KeyboardInterrupt: except KeyboardInterrupt:
self._page_index, self._total_index = page_index, total_index self._page_index, self._total_index = page_index, total_index
raise raise
return self._node_wrapper(node) item = self._node_wrapper(node)
if self._first_node is None:
self._first_node = node
return item
if self._data['page_info']['has_next_page']: if self._data['page_info']['has_next_page']:
query_response = self._query(self._data['page_info']['end_cursor']) query_response = self._query(self._data['page_info']['end_cursor'])
page_index, data = self._page_index, self._data page_index, data = self._page_index, self._data
@ -157,6 +162,15 @@ class NodeIterator(Iterator[T]):
).encode()) ).encode())
return base64.urlsafe_b64encode(magic_hash.digest()).decode() return base64.urlsafe_b64encode(magic_hash.digest()).decode()
@property
def first_item(self) -> Optional[T]:
"""
If this iterator has produced any items, returns the first item produced.
.. versionadded:: 4.8
"""
return self._node_wrapper(self._first_node) if self._first_node is not None else None
def freeze(self) -> FrozenNodeIterator: def freeze(self) -> FrozenNodeIterator:
"""Freeze the iterator for later resuming.""" """Freeze the iterator for later resuming."""
remaining_data = None remaining_data = None
@ -171,6 +185,7 @@ class NodeIterator(Iterator[T]):
total_index=max(self.total_index - 1, 0), total_index=max(self.total_index - 1, 0),
best_before=self._best_before.timestamp() if self._best_before else None, best_before=self._best_before.timestamp() if self._best_before else None,
remaining_data=remaining_data, remaining_data=remaining_data,
first_node=self._first_node,
) )
def thaw(self, frozen: FrozenNodeIterator) -> None: def thaw(self, frozen: FrozenNodeIterator) -> None:
@ -197,6 +212,8 @@ class NodeIterator(Iterator[T]):
self._total_index = frozen.total_index self._total_index = frozen.total_index
self._best_before = datetime.fromtimestamp(frozen.best_before) self._best_before = datetime.fromtimestamp(frozen.best_before)
self._data = frozen.remaining_data self._data = frozen.remaining_data
if frozen.first_node is not None:
self._first_node = frozen.first_node
@contextmanager @contextmanager

View File

@ -1643,6 +1643,8 @@ def load_structure(context: InstaloaderContext, json_structure: dict) -> JsonExp
elif node_type == "Hashtag": elif node_type == "Hashtag":
return Hashtag(context, json_structure['node']) return Hashtag(context, json_structure['node'])
elif node_type == "FrozenNodeIterator": elif node_type == "FrozenNodeIterator":
if not 'first_node' in json_structure['node']:
json_structure['node']['first_node'] = None
return FrozenNodeIterator(**json_structure['node']) return FrozenNodeIterator(**json_structure['node'])
elif 'shortcode' in json_structure: elif 'shortcode' in json_structure:
# Post JSON created with Instaloader v3 # Post JSON created with Instaloader v3