Merge branch 'master' into upcoming/v4.6

This commit is contained in:
Alexander Graf
2020-11-28 19:00:49 +01:00
12 changed files with 274 additions and 68 deletions

View File

@@ -1,7 +1,7 @@
"""Download pictures (or videos) along with their captions and other metadata from Instagram."""
__version__ = '4.5.1'
__version__ = '4.5.4'
try:

View File

@@ -514,15 +514,16 @@ class Instaloader:
# Download the image(s) / video thumbnail and videos within sidecars if desired
downloaded = True
if post.typename == 'GraphSidecar':
for edge_number, sidecar_node in enumerate(post.get_sidecar_nodes(), start=1):
if self.download_pictures and (not sidecar_node.is_video or self.download_video_thumbnails):
# Download sidecar picture or video thumbnail (--no-pictures implies --no-video-thumbnails)
downloaded &= self.download_pic(filename=filename, url=sidecar_node.display_url,
mtime=post.date_local, filename_suffix=str(edge_number))
if sidecar_node.is_video and self.download_videos:
# Download sidecar video if desired
downloaded &= self.download_pic(filename=filename, url=sidecar_node.video_url,
mtime=post.date_local, filename_suffix=str(edge_number))
if self.download_pictures or self.download_videos:
for edge_number, sidecar_node in enumerate(post.get_sidecar_nodes(), start=1):
if self.download_pictures and (not sidecar_node.is_video or self.download_video_thumbnails):
# Download sidecar picture or video thumbnail (--no-pictures implies --no-video-thumbnails)
downloaded &= self.download_pic(filename=filename, url=sidecar_node.display_url,
mtime=post.date_local, filename_suffix=str(edge_number))
if sidecar_node.is_video and self.download_videos:
# Download sidecar video if desired
downloaded &= self.download_pic(filename=filename, url=sidecar_node.video_url,
mtime=post.date_local, filename_suffix=str(edge_number))
elif post.typename == 'GraphImage':
# Download picture
if self.download_pictures:
@@ -578,12 +579,12 @@ class Instaloader:
def _userid_chunks():
assert userids is not None
userids_per_query = 100
userids_per_query = 50
for i in range(0, len(userids), userids_per_query):
yield userids[i:i + userids_per_query]
for userid_chunk in _userid_chunks():
stories = self.context.graphql_query("bf41e22b1c4ba4c9f31b844ebb7d9056",
stories = self.context.graphql_query("303a4ae99711322310f25250d988f3b7",
{"reel_ids": userid_chunk, "precomposed_overlay": False})["data"]
yield from (Story(self.context, media) for media in stories['reels_media'])
@@ -856,7 +857,7 @@ class Instaloader:
"""
self.context.log("Retrieving saved posts...")
assert self.context.username is not None # safe due to @_requires_login; required by typechecker
node_iterator = Profile.from_username(self.context, self.context.username).get_saved_posts()
node_iterator = Profile.own_profile(self.context).get_saved_posts()
self.posts_download_loop(node_iterator, ":saved",
fast_update, post_filter,
max_count=max_count, total_count=node_iterator.count)

View File

@@ -347,10 +347,10 @@ class InstaloaderContext:
raise ConnectionException("\"window._sharedData\" does not contain required keys.")
# If GraphQL data is missing in `window._sharedData`, search for it in `__additionalDataLoaded`.
if 'graphql' not in post_or_profile_page[0]:
match = re.search(r'window\.__additionalDataLoaded\([^{]+{"graphql":({.*})}\);</script>',
match = re.search(r'window\.__additionalDataLoaded\(.*?({.*"graphql":.*})\);</script>',
resp.text)
if match is not None:
post_or_profile_page[0]['graphql'] = json.loads(match.group(1))
post_or_profile_page[0]['graphql'] = json.loads(match.group(1))['graphql']
return resp_json
else:
resp_json = resp.json()
@@ -545,8 +545,8 @@ class RateController:
def __init__(self, context: InstaloaderContext):
self._context = context
self._graphql_query_timestamps = dict() # type: Dict[str, List[float]]
self._graphql_earliest_next_request_time = 0.0
self._query_timestamps = dict() # type: Dict[str, List[float]]
self._earliest_next_request_time = 0.0
def sleep(self, secs: float):
"""Wait given number of seconds."""
@@ -556,11 +556,11 @@ class RateController:
time.sleep(secs)
def _dump_query_timestamps(self, current_time: float, failed_query_type: str):
windows = [10, 11, 15, 20, 30, 60]
windows = [10, 11, 20, 22, 30, 60]
self._context.error("Requests within last {} minutes grouped by type:"
.format('/'.join(str(w) for w in windows)),
repeat_at_end=False)
for query_type, times in self._graphql_query_timestamps.items():
for query_type, times in self._query_timestamps.items():
reqs_in_sliding_window = [sum(t > current_time - w * 60 for t in times) for w in windows]
self._context.error(" {} {:>32}: {}".format(
"*" if query_type == failed_query_type else " ",
@@ -569,28 +569,61 @@ class RateController:
), repeat_at_end=False)
def count_per_sliding_window(self, query_type: str) -> int:
"""Return how many GraphQL requests can be done within the sliding window."""
"""Return how many requests can be done within the sliding window."""
# Not static, to allow for the count_per_sliding_window to depend on context-inherent properties, such as
# whether we are logged in.
# pylint:disable=no-self-use,unused-argument
return 200
# pylint:disable=no-self-use
return 75 if query_type in ['iphone', 'other'] else 200
def _reqs_in_sliding_window(self, query_type: Optional[str], current_time: float, window: float) -> List[float]:
    """Return the recorded request timestamps that fall inside the sliding window.

    :param query_type: Key into ``self._query_timestamps``. ``None`` selects the timestamps of every
       recorded type except ``'iphone'`` and ``'other'``.
    :param current_time: Reference time, on the same clock as the stored timestamps.
    :param window: Window length; only timestamps strictly newer than ``current_time - window`` are kept.
    :return: The matching timestamps; an empty list if there are none or the type was never recorded.
    """
    if query_type is not None:
        # timestamps of type query_type; a type without any recorded request
        # contributes nothing instead of raising KeyError
        relevant_timestamps = self._query_timestamps.get(query_type, [])
    else:
        # all GraphQL queries, i.e. not 'iphone' or 'other'
        relevant_timestamps = [t
                               for recorded_type, times in self._query_timestamps.items()
                               if recorded_type not in ('iphone', 'other')
                               for t in times]
    window_start = current_time - window
    return [t for t in relevant_timestamps if t > window_start]
def query_waittime(self, query_type: str, current_time: float, untracked_queries: bool = False) -> float:
"""Calculate time needed to wait before GraphQL query can be executed."""
sliding_window = 660
if query_type not in self._graphql_query_timestamps:
self._graphql_query_timestamps[query_type] = []
self._graphql_query_timestamps[query_type] = list(filter(lambda t: t > current_time - 60 * 60,
self._graphql_query_timestamps[query_type]))
reqs_in_sliding_window = list(filter(lambda t: t > current_time - sliding_window,
self._graphql_query_timestamps[query_type]))
count_per_sliding_window = self.count_per_sliding_window(query_type)
if len(reqs_in_sliding_window) < count_per_sliding_window and not untracked_queries:
return max(0.0, self._graphql_earliest_next_request_time - current_time)
next_request_time = min(reqs_in_sliding_window) + sliding_window + 6
if untracked_queries:
self._graphql_earliest_next_request_time = next_request_time
return max(next_request_time, self._graphql_earliest_next_request_time) - current_time
"""Calculate time needed to wait before query can be executed."""
per_type_sliding_window = 660
if query_type not in self._query_timestamps:
self._query_timestamps[query_type] = []
self._query_timestamps[query_type] = list(filter(lambda t: t > current_time - 60 * 60,
self._query_timestamps[query_type]))
def per_type_next_request_time():
reqs_in_sliding_window = self._reqs_in_sliding_window(query_type, current_time, per_type_sliding_window)
if len(reqs_in_sliding_window) < self.count_per_sliding_window(query_type):
return 0.0
else:
return min(reqs_in_sliding_window) + per_type_sliding_window + 6
def gql_accumulated_next_request_time():
if query_type in ['iphone', 'other']:
return 0.0
gql_accumulated_sliding_window = 600
gql_accumulated_max_count = 275
reqs_in_sliding_window = self._reqs_in_sliding_window(None, current_time, gql_accumulated_sliding_window)
if len(reqs_in_sliding_window) < gql_accumulated_max_count:
return 0.0
else:
return min(reqs_in_sliding_window) + gql_accumulated_sliding_window
def untracked_next_request_time():
if untracked_queries:
reqs_in_sliding_window = self._reqs_in_sliding_window(query_type, current_time, per_type_sliding_window)
self._earliest_next_request_time = min(reqs_in_sliding_window) + per_type_sliding_window + 6
return self._earliest_next_request_time
return max(0.0,
max(
per_type_next_request_time(),
gql_accumulated_next_request_time(),
untracked_next_request_time(),
) - current_time)
def wait_before_query(self, query_type: str) -> None:
"""This method is called before a query to Instagram. It calls :meth:`RateController.sleep` to wait
@@ -602,10 +635,10 @@ class RateController:
.format(round(waittime), datetime.now() + timedelta(seconds=waittime)))
if waittime > 0:
self.sleep(waittime)
if query_type not in self._graphql_query_timestamps:
self._graphql_query_timestamps[query_type] = [time.monotonic()]
if query_type not in self._query_timestamps:
self._query_timestamps[query_type] = [time.monotonic()]
else:
self._graphql_query_timestamps[query_type].append(time.monotonic())
self._query_timestamps[query_type].append(time.monotonic())
def handle_429(self, query_type: str) -> None:
"""This method is called to handle a 429 Too Many Requests response. It calls :meth:`RateController.sleep` to

View File

@@ -81,11 +81,13 @@ class NodeIterator(Iterator[T]):
self._node_wrapper = node_wrapper
self._query_variables = query_variables if query_variables is not None else {}
self._query_referer = query_referer
self._data = first_data
self._page_index = 0
self._total_index = 0
self._best_before = (None if first_data is None else
datetime.now() + NodeIterator._shelf_life)
if first_data is not None:
self._data = first_data
self._best_before = datetime.now() + NodeIterator._shelf_life
else:
self._data = self._query()
def _query(self, after: Optional[str] = None) -> Dict:
pagination_variables = {'first': NodeIterator._graphql_page_length} # type: Dict[str, Any]
@@ -113,8 +115,6 @@ class NodeIterator(Iterator[T]):
return self
def __next__(self) -> T:
if self._data is None:
self._data = self._query()
if self._page_index < len(self._data['edges']):
node = self._data['edges'][self._page_index]['node']
page_index, total_index = self._page_index, self._total_index
@@ -193,8 +193,12 @@ class NodeIterator(Iterator[T]):
self._query_referer != frozen.query_referer or
self._context.username != frozen.context_username):
raise InvalidArgumentException("Mismatching resume information.")
if not frozen.best_before:
raise InvalidArgumentException("\"best before\" date missing.")
if frozen.remaining_data is None:
raise InvalidArgumentException("\"remaining_data\" missing.")
self._total_index = frozen.total_index
self._best_before = datetime.fromtimestamp(frozen.best_before) if frozen.best_before else None
self._best_before = datetime.fromtimestamp(frozen.best_before)
self._data = frozen.remaining_data

View File

@@ -147,10 +147,6 @@ class Post:
)
self._full_metadata_dict = pic_json['data']['shortcode_media']
if self._full_metadata_dict is None:
# issue #449
self._context.error("Fetching Post metadata failed (issue #449). "
"The following data has been returned:\n"
+ json.dumps(pic_json['entry_data'], indent=2))
raise BadResponseException("Fetching Post metadata failed.")
if self.shortcode != self._full_metadata_dict['shortcode']:
self._node.update(self._full_metadata_dict)
@@ -207,7 +203,13 @@ class Post:
@property
def owner_id(self) -> int:
"""The ID of the Post's owner."""
return self.owner_profile.userid
# The ID may already be available, e.g. if the post instance was created
# from a `hashtag.get_posts()` iterator, so no need to make another
# http request.
if 'owner' in self._node and 'id' in self._node['owner']:
return self._node['owner']['id']
else:
return self.owner_profile.userid
@property
def date_local(self) -> datetime:
@@ -441,7 +443,14 @@ class Post:
)
def get_likes(self) -> Iterator['Profile']:
"""Iterate over all likes of the post. A :class:`Profile` instance of each likee is yielded."""
"""
Iterate over all likes of the post. A :class:`Profile` instance of each liker is yielded.
.. versionchanged:: 4.5.4
Require being logged in (as required by Instagram).
"""
if not self._context.is_logged_in:
raise LoginRequiredException("--login required to access likes of a post.")
if self.likes == 0:
# Avoid doing additional requests if there are no likes
return
@@ -555,7 +564,7 @@ class Profile:
"""
# pylint:disable=protected-access
profile = cls(context, {'username': username.lower()})
profile._obtain_metadata() # to raise ProfileNotExistException now in case username is invalid
profile._obtain_metadata() # to raise ProfileNotExistsException now in case username is invalid
return profile
@classmethod
@@ -585,6 +594,17 @@ class Profile:
context.profile_id_cache[profile_id] = profile
return profile
@classmethod
def own_profile(cls, context: InstaloaderContext):
    """Build the profile of the account that ``context`` is logged in as.

    :param context: :attr:`Instaloader.context`
    :raises LoginRequiredException: If ``context`` is not logged in.

    .. versionadded:: 4.5.2"""
    if context.is_logged_in:
        # The node for the new instance is the "user" entry of the query's "data".
        response = context.graphql_query("d6f4427fbe92d846298cf93df0b937d3", {})
        return cls(context, response["data"]["user"])
    raise LoginRequiredException("--login required to access own profile.")
def _asdict(self):
json_node = self._node.copy()
# remove posts to avoid "Circular reference detected" exception
@@ -808,7 +828,6 @@ class Profile:
if self.username != self._context.username:
raise LoginRequiredException("--login={} required to get that profile's saved posts.".format(self.username))
self._obtain_metadata()
return NodeIterator(
self._context,
'f883d95537fbcd400f466f63d42bd8a1',
@@ -816,7 +835,6 @@ class Profile:
lambda n: Post(self._context, n),
{'id': self.userid},
'https://www.instagram.com/{0}/'.format(self.username),
self._metadata('edge_saved_media'),
)
def get_tagged_posts(self) -> NodeIterator[Post]:
@@ -1369,9 +1387,13 @@ class Hashtag:
next_other = next(other_posts, None)
while next_top is not None or next_other is not None:
if next_other is None:
assert next_top is not None
yield next_top
yield from sorted_top_posts
break
if next_top is None:
assert next_other is not None
yield next_other
yield from other_posts
break
if next_top == next_other: