From e25eb2a948d853d705a66ee1483812fcb4e0130d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Koch-Kramer?= Date: Fri, 20 Apr 2018 12:59:41 +0200 Subject: [PATCH] Limit GraphQL queries to 20 per 11 minutes cherry-picked from commit d90c05e0a44cb0c41b98436ac18bfd1955a0a9ce - Set GRAPHQL_PAGE_LENGTH to 50, which appears to be the new working maximum. - Limit GQL queries to 20 per 666 seconds (a 660-second sliding window plus a 6-second margin). - Remove logic for tracking queries per query identifier, as Instagram only allows 20 overall GQL queries per sliding window. Related to #101 --- instaloader/instaloader.py | 4 ++-- instaloader/instaloadercontext.py | 26 +++++++++++--------------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/instaloader/instaloader.py b/instaloader/instaloader.py index 94b1c11..6064203 100644 --- a/instaloader/instaloader.py +++ b/instaloader/instaloader.py @@ -111,11 +111,11 @@ class Instaloader: self.download_geotags, self.save_captions, self.download_comments, self.save_metadata, self.compress_json, self.post_metadata_txt_pattern, self.storyitem_metadata_txt_pattern, self.context.max_connection_attempts) - new_loader.context.previous_queries = self.context.previous_queries + new_loader.context.query_timestamps = self.context.query_timestamps yield new_loader self.context.error_log.extend(new_loader.context.error_log) new_loader.context.error_log = [] # avoid double-printing of errors - self.context.previous_queries = new_loader.context.previous_queries + self.context.query_timestamps = new_loader.context.query_timestamps new_loader.close() def close(self): diff --git a/instaloader/instaloadercontext.py b/instaloader/instaloadercontext.py index d3725ae..f08aa52 100644 --- a/instaloader/instaloadercontext.py +++ b/instaloader/instaloadercontext.py @@ -61,7 +61,7 @@ class InstaloaderContext: self.error_log = [] # For the adaption of sleep intervals (rate control) - self.previous_queries = dict() + self.query_timestamps = list() # Can be set to True for testing, disables 
supression of InstaloaderContext._error_catcher self.raise_all_errors = False @@ -195,29 +195,25 @@ class InstaloaderContext: :raises QueryReturnedNotFoundException: When the server responds with a 404. :raises ConnectionException: When query repeatedly failed. """ - def graphql_query_waittime(query_hash: str, untracked_queries: bool = False) -> int: + def graphql_query_waittime(untracked_queries: bool = False) -> int: sliding_window = 660 - timestamps = self.previous_queries.get(query_hash) - if not timestamps: + if not self.query_timestamps: return sliding_window if untracked_queries else 0 current_time = time.monotonic() - timestamps = list(filter(lambda t: t > current_time - sliding_window, timestamps)) - self.previous_queries[query_hash] = timestamps - if len(timestamps) < 100 and not untracked_queries: + self.query_timestamps = list(filter(lambda t: t > current_time - sliding_window, self.query_timestamps)) + if len(self.query_timestamps) < 20 and not untracked_queries: return 0 - return round(min(timestamps) + sliding_window - current_time) + 6 + return round(min(self.query_timestamps) + sliding_window - current_time) + 6 is_graphql_query = 'query_hash' in params and 'graphql/query' in path if is_graphql_query: - query_hash = params['query_hash'] - waittime = graphql_query_waittime(query_hash) + waittime = graphql_query_waittime() if waittime > 0: self.log('\nToo many queries in the last time. 
Need to wait {} seconds.'.format(waittime)) time.sleep(waittime) - timestamp_list = self.previous_queries.get(query_hash) - if timestamp_list is not None: - timestamp_list.append(time.monotonic()) + if self.query_timestamps is not None: + self.query_timestamps.append(time.monotonic()) else: - self.previous_queries[query_hash] = [time.monotonic()] + self.query_timestamps = [time.monotonic()] sess = session if session else self._session try: self._sleep() @@ -265,7 +261,7 @@ class InstaloaderContext: if isinstance(err, TooManyRequestsException): print(textwrap.fill(text_for_429), file=sys.stderr) if is_graphql_query: - waittime = graphql_query_waittime(query_hash=params['query_hash'], untracked_queries=True) + waittime = graphql_query_waittime(untracked_queries=True) if waittime > 0: self.log('The request will be retried in {} seconds.'.format(waittime)) time.sleep(waittime)