Merge branch 'master' into upcoming/v4.6

2020-11-28 19:00:49 +01:00
parent e11b88d44b e9648bc551
commit 4c02a186d3
12 changed files with 274 additions and 68 deletions
--- a/instaloader/__init__.py
+++ b/instaloader/__init__.py
@@ -1,7 +1,7 @@
 """Download pictures (or videos) along with their captions and other metadata from Instagram."""


-__version__ = '4.5.1'
+__version__ = '4.5.4'


 try:
--- a/instaloader/instaloader.py
+++ b/instaloader/instaloader.py
@@ -514,15 +514,16 @@ class Instaloader:
        # Download the image(s) / video thumbnail and videos within sidecars if desired
        downloaded = True
        if post.typename == 'GraphSidecar':
-            for edge_number, sidecar_node in enumerate(post.get_sidecar_nodes(), start=1):
-                if self.download_pictures and (not sidecar_node.is_video or self.download_video_thumbnails):
-                    # Download sidecar picture or video thumbnail (--no-pictures implies --no-video-thumbnails)
-                    downloaded &= self.download_pic(filename=filename, url=sidecar_node.display_url,
-                                                    mtime=post.date_local, filename_suffix=str(edge_number))
-                if sidecar_node.is_video and self.download_videos:
-                    # Download sidecar video if desired
-                    downloaded &= self.download_pic(filename=filename, url=sidecar_node.video_url,
-                                                    mtime=post.date_local, filename_suffix=str(edge_number))
+            if self.download_pictures or self.download_videos:
+                for edge_number, sidecar_node in enumerate(post.get_sidecar_nodes(), start=1):
+                    if self.download_pictures and (not sidecar_node.is_video or self.download_video_thumbnails):
+                        # Download sidecar picture or video thumbnail (--no-pictures implies --no-video-thumbnails)
+                        downloaded &= self.download_pic(filename=filename, url=sidecar_node.display_url,
+                                                        mtime=post.date_local, filename_suffix=str(edge_number))
+                    if sidecar_node.is_video and self.download_videos:
+                        # Download sidecar video if desired
+                        downloaded &= self.download_pic(filename=filename, url=sidecar_node.video_url,
+                                                        mtime=post.date_local, filename_suffix=str(edge_number))
        elif post.typename == 'GraphImage':
            # Download picture
            if self.download_pictures:
@@ -578,12 +579,12 @@ class Instaloader:

        def _userid_chunks():
            assert userids is not None
-            userids_per_query = 100
+            userids_per_query = 50
            for i in range(0, len(userids), userids_per_query):
                yield userids[i:i + userids_per_query]

        for userid_chunk in _userid_chunks():
-            stories = self.context.graphql_query("bf41e22b1c4ba4c9f31b844ebb7d9056",
+            stories = self.context.graphql_query("303a4ae99711322310f25250d988f3b7",
                                                 {"reel_ids": userid_chunk, "precomposed_overlay": False})["data"]
            yield from (Story(self.context, media) for media in stories['reels_media'])

@@ -856,7 +857,7 @@ class Instaloader:
        """
        self.context.log("Retrieving saved posts...")
        assert self.context.username is not None  # safe due to @_requires_login; required by typechecker
-        node_iterator = Profile.from_username(self.context, self.context.username).get_saved_posts()
+        node_iterator = Profile.own_profile(self.context).get_saved_posts()
        self.posts_download_loop(node_iterator, ":saved",
                                 fast_update, post_filter,
                                 max_count=max_count, total_count=node_iterator.count)
--- a/instaloader/instaloadercontext.py
+++ b/instaloader/instaloadercontext.py
@@ -347,10 +347,10 @@ class InstaloaderContext:
                    raise ConnectionException("\"window._sharedData\" does not contain required keys.")
                # If GraphQL data is missing in `window._sharedData`, search for it in `__additionalDataLoaded`.
                if 'graphql' not in post_or_profile_page[0]:
-                    match = re.search(r'window\.__additionalDataLoaded\([^{]+{"graphql":({.*})}\);</script>',
+                    match = re.search(r'window\.__additionalDataLoaded\(.*?({.*"graphql":.*})\);</script>',
                                      resp.text)
                    if match is not None:
-                        post_or_profile_page[0]['graphql'] = json.loads(match.group(1))
+                        post_or_profile_page[0]['graphql'] = json.loads(match.group(1))['graphql']
                return resp_json
            else:
                resp_json = resp.json()
@@ -545,8 +545,8 @@ class RateController:

    def __init__(self, context: InstaloaderContext):
        self._context = context
-        self._graphql_query_timestamps = dict()  # type: Dict[str, List[float]]
-        self._graphql_earliest_next_request_time = 0.0
+        self._query_timestamps = dict()  # type: Dict[str, List[float]]
+        self._earliest_next_request_time = 0.0

    def sleep(self, secs: float):
        """Wait given number of seconds."""
@@ -556,11 +556,11 @@ class RateController:
        time.sleep(secs)

    def _dump_query_timestamps(self, current_time: float, failed_query_type: str):
-        windows = [10, 11, 15, 20, 30, 60]
+        windows = [10, 11, 20, 22, 30, 60]
        self._context.error("Requests within last {} minutes grouped by type:"
                            .format('/'.join(str(w) for w in windows)),
                            repeat_at_end=False)
-        for query_type, times in self._graphql_query_timestamps.items():
+        for query_type, times in self._query_timestamps.items():
            reqs_in_sliding_window = [sum(t > current_time - w * 60 for t in times) for w in windows]
            self._context.error(" {} {:>32}: {}".format(
                "*" if query_type == failed_query_type else " ",
@@ -569,28 +569,61 @@ class RateController:
            ), repeat_at_end=False)

    def count_per_sliding_window(self, query_type: str) -> int:
-        """Return how many GraphQL requests can be done within the sliding window."""
+        """Return how many requests can be done within the sliding window."""
        # Not static, to allow for the count_per_sliding_window to depend on context-inherent properties, such as
        # whether we are logged in.
-        # pylint:disable=no-self-use,unused-argument
-        return 200
+        # pylint:disable=no-self-use
+        return 75 if query_type in ['iphone', 'other'] else 200
+
+    def _reqs_in_sliding_window(self, query_type: Optional[str], current_time: float, window: float) -> List[float]:
+        if query_type is not None:
+            # timestamps of type query_type
+            relevant_timestamps = self._query_timestamps[query_type]
+        else:
+            # all GraphQL queries, i.e. not 'iphone' or 'other'
+            graphql_query_timestamps = filter(lambda tp: tp[0] not in ['iphone', 'other'],
+                                              self._query_timestamps.items())
+            relevant_timestamps = [t for times in (tp[1] for tp in graphql_query_timestamps) for t in times]
+        return list(filter(lambda t: t > current_time - window, relevant_timestamps))

    def query_waittime(self, query_type: str, current_time: float, untracked_queries: bool = False) -> float:
-        """Calculate time needed to wait before GraphQL query can be executed."""
-        sliding_window = 660
-        if query_type not in self._graphql_query_timestamps:
-            self._graphql_query_timestamps[query_type] = []
-        self._graphql_query_timestamps[query_type] = list(filter(lambda t: t > current_time - 60 * 60,
-                                                                 self._graphql_query_timestamps[query_type]))
-        reqs_in_sliding_window = list(filter(lambda t: t > current_time - sliding_window,
-                                             self._graphql_query_timestamps[query_type]))
-        count_per_sliding_window = self.count_per_sliding_window(query_type)
-        if len(reqs_in_sliding_window) < count_per_sliding_window and not untracked_queries:
-            return max(0.0, self._graphql_earliest_next_request_time - current_time)
-        next_request_time = min(reqs_in_sliding_window) + sliding_window + 6
-        if untracked_queries:
-            self._graphql_earliest_next_request_time = next_request_time
-        return max(next_request_time, self._graphql_earliest_next_request_time) - current_time
+        """Calculate time needed to wait before query can be executed."""
+        per_type_sliding_window = 660
+        if query_type not in self._query_timestamps:
+            self._query_timestamps[query_type] = []
+        self._query_timestamps[query_type] = list(filter(lambda t: t > current_time - 60 * 60,
+                                                         self._query_timestamps[query_type]))
+
+        def per_type_next_request_time():
+            reqs_in_sliding_window = self._reqs_in_sliding_window(query_type, current_time, per_type_sliding_window)
+            if len(reqs_in_sliding_window) < self.count_per_sliding_window(query_type):
+                return 0.0
+            else:
+                return min(reqs_in_sliding_window) + per_type_sliding_window + 6
+
+        def gql_accumulated_next_request_time():
+            if query_type in ['iphone', 'other']:
+                return 0.0
+            gql_accumulated_sliding_window = 600
+            gql_accumulated_max_count = 275
+            reqs_in_sliding_window = self._reqs_in_sliding_window(None, current_time, gql_accumulated_sliding_window)
+            if len(reqs_in_sliding_window) < gql_accumulated_max_count:
+                return 0.0
+            else:
+                return min(reqs_in_sliding_window) + gql_accumulated_sliding_window
+
+        def untracked_next_request_time():
+            if untracked_queries:
+                reqs_in_sliding_window = self._reqs_in_sliding_window(query_type, current_time, per_type_sliding_window)
+                self._earliest_next_request_time = min(reqs_in_sliding_window) + per_type_sliding_window + 6
+            return self._earliest_next_request_time
+
+        return max(0.0,
+                   max(
+                       per_type_next_request_time(),
+                       gql_accumulated_next_request_time(),
+                       untracked_next_request_time(),
+                   ) - current_time)

    def wait_before_query(self, query_type: str) -> None:
        """This method is called before a query to Instagram. It calls :meth:`RateController.sleep` to wait
@@ -602,10 +635,10 @@ class RateController:
                              .format(round(waittime), datetime.now() + timedelta(seconds=waittime)))
        if waittime > 0:
            self.sleep(waittime)
-        if query_type not in self._graphql_query_timestamps:
-            self._graphql_query_timestamps[query_type] = [time.monotonic()]
+        if query_type not in self._query_timestamps:
+            self._query_timestamps[query_type] = [time.monotonic()]
        else:
-            self._graphql_query_timestamps[query_type].append(time.monotonic())
+            self._query_timestamps[query_type].append(time.monotonic())

    def handle_429(self, query_type: str) -> None:
        """This method is called to handle a 429 Too Many Requests response. It calls :meth:`RateController.sleep` to
--- a/instaloader/nodeiterator.py
+++ b/instaloader/nodeiterator.py
@@ -81,11 +81,13 @@ class NodeIterator(Iterator[T]):
        self._node_wrapper = node_wrapper
        self._query_variables = query_variables if query_variables is not None else {}
        self._query_referer = query_referer
-        self._data = first_data
        self._page_index = 0
        self._total_index = 0
-        self._best_before = (None if first_data is None else
-                             datetime.now() + NodeIterator._shelf_life)
+        if first_data is not None:
+            self._data = first_data
+            self._best_before = datetime.now() + NodeIterator._shelf_life
+        else:
+            self._data = self._query()

    def _query(self, after: Optional[str] = None) -> Dict:
        pagination_variables = {'first': NodeIterator._graphql_page_length}  # type: Dict[str, Any]
@@ -113,8 +115,6 @@ class NodeIterator(Iterator[T]):
        return self

    def __next__(self) -> T:
-        if self._data is None:
-            self._data = self._query()
        if self._page_index < len(self._data['edges']):
            node = self._data['edges'][self._page_index]['node']
            page_index, total_index = self._page_index, self._total_index
@@ -193,8 +193,12 @@ class NodeIterator(Iterator[T]):
                self._query_referer != frozen.query_referer or
                self._context.username != frozen.context_username):
            raise InvalidArgumentException("Mismatching resume information.")
+        if not frozen.best_before:
+            raise InvalidArgumentException("\"best before\" date missing.")
+        if frozen.remaining_data is None:
+            raise InvalidArgumentException("\"remaining_data\" missing.")
        self._total_index = frozen.total_index
-        self._best_before = datetime.fromtimestamp(frozen.best_before) if frozen.best_before else None
+        self._best_before = datetime.fromtimestamp(frozen.best_before)
        self._data = frozen.remaining_data


--- a/instaloader/structures.py
+++ b/instaloader/structures.py
@@ -147,10 +147,6 @@ class Post:
            )
            self._full_metadata_dict = pic_json['data']['shortcode_media']
            if self._full_metadata_dict is None:
-                # issue #449
-                self._context.error("Fetching Post metadata failed (issue #449). "
-                                    "The following data has been returned:\n"
-                                    + json.dumps(pic_json['entry_data'], indent=2))
                raise BadResponseException("Fetching Post metadata failed.")
            if self.shortcode != self._full_metadata_dict['shortcode']:
                self._node.update(self._full_metadata_dict)
@@ -207,7 +203,13 @@ class Post:
    @property
    def owner_id(self) -> int:
        """The ID of the Post's owner."""
-        return self.owner_profile.userid
+        # The ID may already be available, e.g. if the post instance was created
+        # from an `hashtag.get_posts()` iterator, so no need to make another
+        # http request.
+        if 'owner' in self._node and 'id' in self._node['owner']:
+            return self._node['owner']['id']
+        else:
+            return self.owner_profile.userid

    @property
    def date_local(self) -> datetime:
@@ -441,7 +443,14 @@ class Post:
        )

    def get_likes(self) -> Iterator['Profile']:
-        """Iterate over all likes of the post. A :class:`Profile` instance of each likee is yielded."""
+        """
+        Iterate over all likes of the post. A :class:`Profile` instance of each likee is yielded.
+
+        .. versionchanged:: 4.5.4
+           Require being logged in (as required by Instagram).
+        """
+        if not self._context.is_logged_in:
+            raise LoginRequiredException("--login required to access likes of a post.")
        if self.likes == 0:
            # Avoid doing additional requests if there are no comments
            return
@@ -555,7 +564,7 @@ class Profile:
        """
        # pylint:disable=protected-access
        profile = cls(context, {'username': username.lower()})
-        profile._obtain_metadata()  # to raise ProfileNotExistException now in case username is invalid
+        profile._obtain_metadata()  # to raise ProfileNotExistsException now in case username is invalid
        return profile

    @classmethod
@@ -585,6 +594,17 @@ class Profile:
        context.profile_id_cache[profile_id] = profile
        return profile

+    @classmethod
+    def own_profile(cls, context: InstaloaderContext):
+        """Return own profile if logged-in.
+
+        :param context: :attr:`Instaloader.context`
+
+        .. versionadded:: 4.5.2"""
+        if not context.is_logged_in:
+            raise LoginRequiredException("--login required to access own profile.")
+        return cls(context, context.graphql_query("d6f4427fbe92d846298cf93df0b937d3", {})["data"]["user"])
+
    def _asdict(self):
        json_node = self._node.copy()
        # remove posts to avoid "Circular reference detected" exception
@@ -808,7 +828,6 @@ class Profile:
        if self.username != self._context.username:
            raise LoginRequiredException("--login={} required to get that profile's saved posts.".format(self.username))

-        self._obtain_metadata()
        return NodeIterator(
            self._context,
            'f883d95537fbcd400f466f63d42bd8a1',
@@ -816,7 +835,6 @@ class Profile:
            lambda n: Post(self._context, n),
            {'id': self.userid},
            'https://www.instagram.com/{0}/'.format(self.username),
-            self._metadata('edge_saved_media'),
        )

    def get_tagged_posts(self) -> NodeIterator[Post]:
@@ -1369,9 +1387,13 @@ class Hashtag:
        next_other = next(other_posts, None)
        while next_top is not None or next_other is not None:
            if next_other is None:
+                assert next_top is not None
+                yield next_top
                yield from sorted_top_posts
                break
            if next_top is None:
+                assert next_other is not None
+                yield next_other
                yield from other_posts
                break
            if next_top == next_other: