Merge branch 'master' into upcoming/v4.6
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
"""Download pictures (or videos) along with their captions and other metadata from Instagram."""
|
||||
|
||||
|
||||
__version__ = '4.5.1'
|
||||
__version__ = '4.5.4'
|
||||
|
||||
|
||||
try:
|
||||
|
@@ -514,15 +514,16 @@ class Instaloader:
|
||||
# Download the image(s) / video thumbnail and videos within sidecars if desired
|
||||
downloaded = True
|
||||
if post.typename == 'GraphSidecar':
|
||||
for edge_number, sidecar_node in enumerate(post.get_sidecar_nodes(), start=1):
|
||||
if self.download_pictures and (not sidecar_node.is_video or self.download_video_thumbnails):
|
||||
# Download sidecar picture or video thumbnail (--no-pictures implies --no-video-thumbnails)
|
||||
downloaded &= self.download_pic(filename=filename, url=sidecar_node.display_url,
|
||||
mtime=post.date_local, filename_suffix=str(edge_number))
|
||||
if sidecar_node.is_video and self.download_videos:
|
||||
# Download sidecar video if desired
|
||||
downloaded &= self.download_pic(filename=filename, url=sidecar_node.video_url,
|
||||
mtime=post.date_local, filename_suffix=str(edge_number))
|
||||
if self.download_pictures or self.download_videos:
|
||||
for edge_number, sidecar_node in enumerate(post.get_sidecar_nodes(), start=1):
|
||||
if self.download_pictures and (not sidecar_node.is_video or self.download_video_thumbnails):
|
||||
# Download sidecar picture or video thumbnail (--no-pictures implies --no-video-thumbnails)
|
||||
downloaded &= self.download_pic(filename=filename, url=sidecar_node.display_url,
|
||||
mtime=post.date_local, filename_suffix=str(edge_number))
|
||||
if sidecar_node.is_video and self.download_videos:
|
||||
# Download sidecar video if desired
|
||||
downloaded &= self.download_pic(filename=filename, url=sidecar_node.video_url,
|
||||
mtime=post.date_local, filename_suffix=str(edge_number))
|
||||
elif post.typename == 'GraphImage':
|
||||
# Download picture
|
||||
if self.download_pictures:
|
||||
@@ -578,12 +579,12 @@ class Instaloader:
|
||||
|
||||
def _userid_chunks():
|
||||
assert userids is not None
|
||||
userids_per_query = 100
|
||||
userids_per_query = 50
|
||||
for i in range(0, len(userids), userids_per_query):
|
||||
yield userids[i:i + userids_per_query]
|
||||
|
||||
for userid_chunk in _userid_chunks():
|
||||
stories = self.context.graphql_query("bf41e22b1c4ba4c9f31b844ebb7d9056",
|
||||
stories = self.context.graphql_query("303a4ae99711322310f25250d988f3b7",
|
||||
{"reel_ids": userid_chunk, "precomposed_overlay": False})["data"]
|
||||
yield from (Story(self.context, media) for media in stories['reels_media'])
|
||||
|
||||
@@ -856,7 +857,7 @@ class Instaloader:
|
||||
"""
|
||||
self.context.log("Retrieving saved posts...")
|
||||
assert self.context.username is not None # safe due to @_requires_login; required by typechecker
|
||||
node_iterator = Profile.from_username(self.context, self.context.username).get_saved_posts()
|
||||
node_iterator = Profile.own_profile(self.context).get_saved_posts()
|
||||
self.posts_download_loop(node_iterator, ":saved",
|
||||
fast_update, post_filter,
|
||||
max_count=max_count, total_count=node_iterator.count)
|
||||
|
@@ -347,10 +347,10 @@ class InstaloaderContext:
|
||||
raise ConnectionException("\"window._sharedData\" does not contain required keys.")
|
||||
# If GraphQL data is missing in `window._sharedData`, search for it in `__additionalDataLoaded`.
|
||||
if 'graphql' not in post_or_profile_page[0]:
|
||||
match = re.search(r'window\.__additionalDataLoaded\([^{]+{"graphql":({.*})}\);</script>',
|
||||
match = re.search(r'window\.__additionalDataLoaded\(.*?({.*"graphql":.*})\);</script>',
|
||||
resp.text)
|
||||
if match is not None:
|
||||
post_or_profile_page[0]['graphql'] = json.loads(match.group(1))
|
||||
post_or_profile_page[0]['graphql'] = json.loads(match.group(1))['graphql']
|
||||
return resp_json
|
||||
else:
|
||||
resp_json = resp.json()
|
||||
@@ -545,8 +545,8 @@ class RateController:
|
||||
|
||||
def __init__(self, context: InstaloaderContext):
|
||||
self._context = context
|
||||
self._graphql_query_timestamps = dict() # type: Dict[str, List[float]]
|
||||
self._graphql_earliest_next_request_time = 0.0
|
||||
self._query_timestamps = dict() # type: Dict[str, List[float]]
|
||||
self._earliest_next_request_time = 0.0
|
||||
|
||||
def sleep(self, secs: float):
|
||||
"""Wait given number of seconds."""
|
||||
@@ -556,11 +556,11 @@ class RateController:
|
||||
time.sleep(secs)
|
||||
|
||||
def _dump_query_timestamps(self, current_time: float, failed_query_type: str):
|
||||
windows = [10, 11, 15, 20, 30, 60]
|
||||
windows = [10, 11, 20, 22, 30, 60]
|
||||
self._context.error("Requests within last {} minutes grouped by type:"
|
||||
.format('/'.join(str(w) for w in windows)),
|
||||
repeat_at_end=False)
|
||||
for query_type, times in self._graphql_query_timestamps.items():
|
||||
for query_type, times in self._query_timestamps.items():
|
||||
reqs_in_sliding_window = [sum(t > current_time - w * 60 for t in times) for w in windows]
|
||||
self._context.error(" {} {:>32}: {}".format(
|
||||
"*" if query_type == failed_query_type else " ",
|
||||
@@ -569,28 +569,61 @@ class RateController:
|
||||
), repeat_at_end=False)
|
||||
|
||||
def count_per_sliding_window(self, query_type: str) -> int:
|
||||
"""Return how many GraphQL requests can be done within the sliding window."""
|
||||
"""Return how many requests can be done within the sliding window."""
|
||||
# Not static, to allow for the count_per_sliding_window to depend on context-inherent properties, such as
|
||||
# whether we are logged in.
|
||||
# pylint:disable=no-self-use,unused-argument
|
||||
return 200
|
||||
# pylint:disable=no-self-use
|
||||
return 75 if query_type in ['iphone', 'other'] else 200
|
||||
|
||||
def _reqs_in_sliding_window(self, query_type: Optional[str], current_time: float, window: float) -> List[float]:
|
||||
if query_type is not None:
|
||||
# timestamps of type query_type
|
||||
relevant_timestamps = self._query_timestamps[query_type]
|
||||
else:
|
||||
# all GraphQL queries, i.e. not 'iphone' or 'other'
|
||||
graphql_query_timestamps = filter(lambda tp: tp[0] not in ['iphone', 'other'],
|
||||
self._query_timestamps.items())
|
||||
relevant_timestamps = [t for times in (tp[1] for tp in graphql_query_timestamps) for t in times]
|
||||
return list(filter(lambda t: t > current_time - window, relevant_timestamps))
|
||||
|
||||
def query_waittime(self, query_type: str, current_time: float, untracked_queries: bool = False) -> float:
|
||||
"""Calculate time needed to wait before GraphQL query can be executed."""
|
||||
sliding_window = 660
|
||||
if query_type not in self._graphql_query_timestamps:
|
||||
self._graphql_query_timestamps[query_type] = []
|
||||
self._graphql_query_timestamps[query_type] = list(filter(lambda t: t > current_time - 60 * 60,
|
||||
self._graphql_query_timestamps[query_type]))
|
||||
reqs_in_sliding_window = list(filter(lambda t: t > current_time - sliding_window,
|
||||
self._graphql_query_timestamps[query_type]))
|
||||
count_per_sliding_window = self.count_per_sliding_window(query_type)
|
||||
if len(reqs_in_sliding_window) < count_per_sliding_window and not untracked_queries:
|
||||
return max(0.0, self._graphql_earliest_next_request_time - current_time)
|
||||
next_request_time = min(reqs_in_sliding_window) + sliding_window + 6
|
||||
if untracked_queries:
|
||||
self._graphql_earliest_next_request_time = next_request_time
|
||||
return max(next_request_time, self._graphql_earliest_next_request_time) - current_time
|
||||
"""Calculate time needed to wait before query can be executed."""
|
||||
per_type_sliding_window = 660
|
||||
if query_type not in self._query_timestamps:
|
||||
self._query_timestamps[query_type] = []
|
||||
self._query_timestamps[query_type] = list(filter(lambda t: t > current_time - 60 * 60,
|
||||
self._query_timestamps[query_type]))
|
||||
|
||||
def per_type_next_request_time():
|
||||
reqs_in_sliding_window = self._reqs_in_sliding_window(query_type, current_time, per_type_sliding_window)
|
||||
if len(reqs_in_sliding_window) < self.count_per_sliding_window(query_type):
|
||||
return 0.0
|
||||
else:
|
||||
return min(reqs_in_sliding_window) + per_type_sliding_window + 6
|
||||
|
||||
def gql_accumulated_next_request_time():
|
||||
if query_type in ['iphone', 'other']:
|
||||
return 0.0
|
||||
gql_accumulated_sliding_window = 600
|
||||
gql_accumulated_max_count = 275
|
||||
reqs_in_sliding_window = self._reqs_in_sliding_window(None, current_time, gql_accumulated_sliding_window)
|
||||
if len(reqs_in_sliding_window) < gql_accumulated_max_count:
|
||||
return 0.0
|
||||
else:
|
||||
return min(reqs_in_sliding_window) + gql_accumulated_sliding_window
|
||||
|
||||
def untracked_next_request_time():
|
||||
if untracked_queries:
|
||||
reqs_in_sliding_window = self._reqs_in_sliding_window(query_type, current_time, per_type_sliding_window)
|
||||
self._earliest_next_request_time = min(reqs_in_sliding_window) + per_type_sliding_window + 6
|
||||
return self._earliest_next_request_time
|
||||
|
||||
return max(0.0,
|
||||
max(
|
||||
per_type_next_request_time(),
|
||||
gql_accumulated_next_request_time(),
|
||||
untracked_next_request_time(),
|
||||
) - current_time)
|
||||
|
||||
def wait_before_query(self, query_type: str) -> None:
|
||||
"""This method is called before a query to Instagram. It calls :meth:`RateController.sleep` to wait
|
||||
@@ -602,10 +635,10 @@ class RateController:
|
||||
.format(round(waittime), datetime.now() + timedelta(seconds=waittime)))
|
||||
if waittime > 0:
|
||||
self.sleep(waittime)
|
||||
if query_type not in self._graphql_query_timestamps:
|
||||
self._graphql_query_timestamps[query_type] = [time.monotonic()]
|
||||
if query_type not in self._query_timestamps:
|
||||
self._query_timestamps[query_type] = [time.monotonic()]
|
||||
else:
|
||||
self._graphql_query_timestamps[query_type].append(time.monotonic())
|
||||
self._query_timestamps[query_type].append(time.monotonic())
|
||||
|
||||
def handle_429(self, query_type: str) -> None:
|
||||
"""This method is called to handle a 429 Too Many Requests response. It calls :meth:`RateController.sleep` to
|
||||
|
@@ -81,11 +81,13 @@ class NodeIterator(Iterator[T]):
|
||||
self._node_wrapper = node_wrapper
|
||||
self._query_variables = query_variables if query_variables is not None else {}
|
||||
self._query_referer = query_referer
|
||||
self._data = first_data
|
||||
self._page_index = 0
|
||||
self._total_index = 0
|
||||
self._best_before = (None if first_data is None else
|
||||
datetime.now() + NodeIterator._shelf_life)
|
||||
if first_data is not None:
|
||||
self._data = first_data
|
||||
self._best_before = datetime.now() + NodeIterator._shelf_life
|
||||
else:
|
||||
self._data = self._query()
|
||||
|
||||
def _query(self, after: Optional[str] = None) -> Dict:
|
||||
pagination_variables = {'first': NodeIterator._graphql_page_length} # type: Dict[str, Any]
|
||||
@@ -113,8 +115,6 @@ class NodeIterator(Iterator[T]):
|
||||
return self
|
||||
|
||||
def __next__(self) -> T:
|
||||
if self._data is None:
|
||||
self._data = self._query()
|
||||
if self._page_index < len(self._data['edges']):
|
||||
node = self._data['edges'][self._page_index]['node']
|
||||
page_index, total_index = self._page_index, self._total_index
|
||||
@@ -193,8 +193,12 @@ class NodeIterator(Iterator[T]):
|
||||
self._query_referer != frozen.query_referer or
|
||||
self._context.username != frozen.context_username):
|
||||
raise InvalidArgumentException("Mismatching resume information.")
|
||||
if not frozen.best_before:
|
||||
raise InvalidArgumentException("\"best before\" date missing.")
|
||||
if frozen.remaining_data is None:
|
||||
raise InvalidArgumentException("\"remaining_data\" missing.")
|
||||
self._total_index = frozen.total_index
|
||||
self._best_before = datetime.fromtimestamp(frozen.best_before) if frozen.best_before else None
|
||||
self._best_before = datetime.fromtimestamp(frozen.best_before)
|
||||
self._data = frozen.remaining_data
|
||||
|
||||
|
||||
|
@@ -147,10 +147,6 @@ class Post:
|
||||
)
|
||||
self._full_metadata_dict = pic_json['data']['shortcode_media']
|
||||
if self._full_metadata_dict is None:
|
||||
# issue #449
|
||||
self._context.error("Fetching Post metadata failed (issue #449). "
|
||||
"The following data has been returned:\n"
|
||||
+ json.dumps(pic_json['entry_data'], indent=2))
|
||||
raise BadResponseException("Fetching Post metadata failed.")
|
||||
if self.shortcode != self._full_metadata_dict['shortcode']:
|
||||
self._node.update(self._full_metadata_dict)
|
||||
@@ -207,7 +203,13 @@ class Post:
|
||||
@property
|
||||
def owner_id(self) -> int:
|
||||
"""The ID of the Post's owner."""
|
||||
return self.owner_profile.userid
|
||||
# The ID may already be available, e.g. if the post instance was created
|
||||
# from a `hashtag.get_posts()` iterator, so no need to make another
|
||||
# http request.
|
||||
if 'owner' in self._node and 'id' in self._node['owner']:
|
||||
return self._node['owner']['id']
|
||||
else:
|
||||
return self.owner_profile.userid
|
||||
|
||||
@property
|
||||
def date_local(self) -> datetime:
|
||||
@@ -441,7 +443,14 @@ class Post:
|
||||
)
|
||||
|
||||
def get_likes(self) -> Iterator['Profile']:
|
||||
"""Iterate over all likes of the post. A :class:`Profile` instance of each likee is yielded."""
|
||||
"""
|
||||
Iterate over all likes of the post. A :class:`Profile` instance of each liker is yielded.
|
||||
|
||||
.. versionchanged:: 4.5.4
|
||||
Require being logged in (as required by Instagram).
|
||||
"""
|
||||
if not self._context.is_logged_in:
|
||||
raise LoginRequiredException("--login required to access likes of a post.")
|
||||
if self.likes == 0:
|
||||
# Avoid doing additional requests if there are no likes
|
||||
return
|
||||
@@ -555,7 +564,7 @@ class Profile:
|
||||
"""
|
||||
# pylint:disable=protected-access
|
||||
profile = cls(context, {'username': username.lower()})
|
||||
profile._obtain_metadata() # to raise ProfileNotExistException now in case username is invalid
|
||||
profile._obtain_metadata() # to raise ProfileNotExistsException now in case username is invalid
|
||||
return profile
|
||||
|
||||
@classmethod
|
||||
@@ -585,6 +594,17 @@ class Profile:
|
||||
context.profile_id_cache[profile_id] = profile
|
||||
return profile
|
||||
|
||||
@classmethod
|
||||
def own_profile(cls, context: InstaloaderContext):
|
||||
"""Return own profile if logged-in.
|
||||
|
||||
:param context: :attr:`Instaloader.context`
|
||||
|
||||
.. versionadded:: 4.5.2"""
|
||||
if not context.is_logged_in:
|
||||
raise LoginRequiredException("--login required to access own profile.")
|
||||
return cls(context, context.graphql_query("d6f4427fbe92d846298cf93df0b937d3", {})["data"]["user"])
|
||||
|
||||
def _asdict(self):
|
||||
json_node = self._node.copy()
|
||||
# remove posts to avoid "Circular reference detected" exception
|
||||
@@ -808,7 +828,6 @@ class Profile:
|
||||
if self.username != self._context.username:
|
||||
raise LoginRequiredException("--login={} required to get that profile's saved posts.".format(self.username))
|
||||
|
||||
self._obtain_metadata()
|
||||
return NodeIterator(
|
||||
self._context,
|
||||
'f883d95537fbcd400f466f63d42bd8a1',
|
||||
@@ -816,7 +835,6 @@ class Profile:
|
||||
lambda n: Post(self._context, n),
|
||||
{'id': self.userid},
|
||||
'https://www.instagram.com/{0}/'.format(self.username),
|
||||
self._metadata('edge_saved_media'),
|
||||
)
|
||||
|
||||
def get_tagged_posts(self) -> NodeIterator[Post]:
|
||||
@@ -1369,9 +1387,13 @@ class Hashtag:
|
||||
next_other = next(other_posts, None)
|
||||
while next_top is not None or next_other is not None:
|
||||
if next_other is None:
|
||||
assert next_top is not None
|
||||
yield next_top
|
||||
yield from sorted_top_posts
|
||||
break
|
||||
if next_top is None:
|
||||
assert next_other is not None
|
||||
yield next_other
|
||||
yield from other_posts
|
||||
break
|
||||
if next_top == next_other:
|
||||
|
Reference in New Issue
Block a user