From 169ce1a300c7b88a545e36c5551cdcb62168e7d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Koch-Kramer?= Date: Thu, 20 Jul 2017 22:30:12 +0200 Subject: [PATCH] Download comments Close #5 --- .travis.yml | 2 +- README.rst | 3 ++ instaloader.py | 105 ++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 94 insertions(+), 16 deletions(-) diff --git a/.travis.yml b/.travis.yml index d3201cd..50562cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,4 +9,4 @@ python: install: - pip install pylint requests script: - - python3 -m pylint -r n -d bad-whitespace,missing-docstring,too-many-arguments,locally-disabled,line-too-long,too-many-public-methods instaloader + - python3 -m pylint -r n -d bad-whitespace,missing-docstring,too-many-arguments,locally-disabled,line-too-long,too-many-public-methods,too-many-lines instaloader diff --git a/README.rst b/README.rst index 1b1aabf..974c61e 100644 --- a/README.rst +++ b/README.rst @@ -118,6 +118,9 @@ renames the folder likewise. link. This requires an additional request to the Instagram server for each picture, which is why it is disabled by default. +--comments Download and update comments for each post. This + requires an additional request to the Instagram server + for each post, which is why it is disabled by default. When to Stop Downloading ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/instaloader.py b/instaloader.py index efab5e8..6be9251 100755 --- a/instaloader.py +++ b/instaloader.py @@ -249,7 +249,7 @@ class Instaloader: if page_info['has_next_page']: resp = tmpsession.get('https://www.instagram.com/graphql/query/', params={'query_id': 17851374694183129, - 'variables': '{"id":"' + str(profile_id) + '","first":500},"after":"' + + 'variables': '{"id":"' + str(profile_id) + '","first":500,"after":"' + page_info['end_cursor'] + '"}'}) data = resp.json() else: @@ -290,7 +290,7 @@ class Instaloader: if page_info['has_next_page']: resp = tmpsession.get('https://www.instagram.com/graphql/query/', params={'query_id': 17874545323001329, - 'variables': '{"id":"' + str(profile_id) + '","first":500},"after":"' + + 'variables': '{"id":"' + str(profile_id) + '","first":500,"after":"' + page_info['end_cursor'] + '"}'}) data = resp.json() else: @@ -299,6 +299,38 @@ class Instaloader: raise ConnectionException("ConnectionError({0}): unable to gather followees.".format(resp.status_code)) return followees + def get_comments(self, shortcode: str) -> List[Dict[str, Any]]: + tmpsession = copy_session(self.session) + header = self.default_http_header(empty_session_only=True) + del header['Connection'] + del header['Content-Length'] + header['authority'] = 'www.instagram.com' + header['scheme'] = 'https' + header['accept'] = '*/*' + header['referer'] = 'https://www.instagram.com/p/' + shortcode + '/' + tmpsession.headers = header + resp = tmpsession.get('https://www.instagram.com/graphql/query/', + params={'query_id': 17852405266163336, + 'variables': '{"shortcode":"' + shortcode + '","first":500}'}) + if resp.status_code == 200: + data = resp.json() + comments = [] + while True: + edge_media_to_comment = data['data']['shortcode_media']['edge_media_to_comment'] + comments.extend([comment['node'] for comment in edge_media_to_comment['edges']]) + page_info = edge_media_to_comment['page_info'] + if page_info['has_next_page']: + resp = tmpsession.get('https://www.instagram.com/graphql/query/', + params={'query_id': 17852405266163336, + 'variables': '{"shortcode":"' + shortcode + '","first":500,"after":"' + + page_info['end_cursor'] + '"}'}) + data = resp.json() + else: + break + else: + raise ConnectionException("ConnectionError({0}): unable to gather comments.".format(resp.status_code)) + return comments + def download_pic(self, name: str, url: str, date_epoch: float, outputlabel: Optional[str] = None, filename_suffix: Optional[str] = None) -> bool: """Downloads and saves picture with given url under given directory with given timestamp. @@ -330,6 +362,32 @@ class Instaloader: else: raise ConnectionException("File \'" + url + "\' could not be downloaded.") + def update_comments(self, name: str, shortcode: str, date_epoch: float) -> None: + if self.profile_subdirs: + filename = name.lower() + '/' + _epoch_to_string(date_epoch) + '_comments.json' + else: + filename = name.lower() + '__' + _epoch_to_string(date_epoch) + '_comments.json' + try: + comments = json.load(open(filename)) + except FileNotFoundError: + comments = list() + comments.extend(self.get_comments(shortcode)) + if comments: + with open(filename, 'w') as file: + comments_list = sorted(sorted(list(comments), key=lambda t: t['id']), + key=lambda t: t['created_at'], reverse=True) + unique_comments_list = [comments_list[0]] + #for comment in comments_list: + # if unique_comments_list[-1]['id'] != comment['id']: + # unique_comments_list.append(comment) + #file.write(json.dumps(unique_comments_list, indent=4)) + #pylint:disable=invalid-name + for x, y in zip(comments_list[:-1], comments_list[1:]): + if x['id'] != y['id']: + unique_comments_list.append(y) + file.write(json.dumps(unique_comments_list, indent=4)) + self._log('comments', end=' ', flush=True) + def save_caption(self, name: str, date_epoch: float, caption: str) -> None: """Updates picture caption""" # pylint:disable=too-many-branches @@ -543,7 +601,7 @@ class Instaloader: return location_json["entry_data"]["LocationsPage"][0]["location"] def download_node(self, node: Dict[str, Any], name: str, - download_videos: bool = True, geotags: bool = False) -> bool: + download_videos: bool = True, geotags: bool = False, download_comments: bool = False) -> bool: """ Download everything associated with one instagram node, i.e. picture, caption and video. @@ -551,6 +609,7 @@ class Instaloader: :param name: Name of profile to which this node belongs :param download_videos: True, if videos should be downloaded :param geotags: Download geotags + :param download_comments: Update comments :return: True if something was downloaded, False otherwise, i.e. file was already there """ # pylint:disable=too-many-branches,too-many-locals @@ -601,12 +660,15 @@ class Instaloader: location = self.get_location(node_code) if location: self.save_location(name, location, date) + if download_comments: + self.update_comments(name, node_code, date) self._log() return downloaded def download_feed_pics(self, max_count: int = None, fast_update: bool = False, filter_func: Optional[Callable[[Dict[str, Dict[str, Any]]], bool]] = None, - download_videos: bool = True, geotags: bool = False) -> None: + download_videos: bool = True, geotags: bool = False, + download_comments: bool = False) -> None: """ Download pictures from the user's feed. @@ -624,6 +686,7 @@ class Instaloader: :param filter_func: function(node), which returns True if given picture should not be downloaded :param download_videos: True, if videos should be downloaded :param geotags: Download geotags + :param download_comments: Update comments """ # pylint:disable=too-many-locals data = self.get_feed_json() @@ -646,7 +709,8 @@ class Instaloader: self._log("[%3i] %s " % (count, name), end="", flush=True) count += 1 downloaded = self.download_node(node, name, - download_videos=download_videos, geotags=geotags) + download_videos=download_videos, geotags=geotags, + download_comments=download_comments) if fast_update and not downloaded: return if not feed["page_info"]["has_next_page"]: @@ -662,6 +726,7 @@ class Instaloader: max_count: Optional[int] = None, filter_func: Optional[Callable[[Dict[str, Dict[str, Any]]], bool]] = None, fast_update: bool = False, download_videos: bool = True, geotags: bool = False, + download_comments: bool = False, lookup_username: bool = False) -> None: """Download pictures of one hashtag. @@ -675,6 +740,7 @@ class Instaloader: :param fast_update: If true, abort when first already-downloaded picture is encountered :param download_videos: True, if videos should be downloaded :param geotags: Download geotags + :param download_comments: Update comments :param lookup_username: Lookup username to encode it in the downloaded file's path, rather than the hashtag """ data = self.get_hashtag_json(hashtag) @@ -694,7 +760,8 @@ class Instaloader: continue count += 1 downloaded = self.download_node(node, pathname, - download_videos=download_videos, geotags=geotags) + download_videos=download_videos, geotags=geotags, + download_comments=download_comments) if fast_update and not downloaded: return if data['entry_data']['TagPage'][0]['tag']['media']['page_info']['has_next_page']: @@ -742,7 +809,7 @@ class Instaloader: def download(self, name: str, profile_pic_only: bool = False, download_videos: bool = True, geotags: bool = False, - fast_update: bool = False) -> None: + download_comments: bool = False, fast_update: bool = False) -> None: """Download one profile""" # pylint:disable=too-many-branches,too-many-locals # Get profile main page json @@ -785,7 +852,8 @@ class Instaloader: self._log("[%3i/%3i] " % (count, totalcount), end="", flush=True) count += 1 downloaded = self.download_node(node, name, - download_videos=download_videos, geotags=geotags) + download_videos=download_videos, geotags=geotags, + download_comments=download_comments) if fast_update and not downloaded: return data = self.get_json(name, max_id=get_last_id(data)) @@ -808,6 +876,7 @@ class Instaloader: def download_profiles(self, profilelist: List[str], username: Optional[str] = None, password: Optional[str] = None, sessionfile: Optional[str] = None, max_count: Optional[int] = None, profile_pic_only: bool = False, download_videos: bool = True, geotags: bool = False, + download_comments: bool = False, fast_update: bool = False, hashtag_lookup_username: bool = False) -> None: """Download set of profiles and handle sessions""" # pylint:disable=too-many-branches,too-many-locals,too-many-statements @@ -835,7 +904,7 @@ class Instaloader: self._log("Retrieving pictures with hashtag {0}".format(pentry)) self.download_hashtag(hashtag=pentry[1:], max_count=max_count, fast_update=fast_update, download_videos=download_videos, geotags=geotags, - lookup_username=hashtag_lookup_username) + download_comments=download_comments, lookup_username=hashtag_lookup_username) elif pentry[0] == '@': if username is not None: self._log("Retrieving followees of %s..." % pentry[1:]) @@ -847,7 +916,8 @@ class Instaloader: if username is not None: self._log("Retrieving pictures from your feed...") self.download_feed_pics(fast_update=fast_update, max_count=max_count, - download_videos=download_videos, geotags=geotags) + download_videos=download_videos, geotags=geotags, + download_comments=download_comments) else: print("--login=USERNAME required to download {}.".format(pentry), file=sys.stderr) elif pentry == ":feed-liked": @@ -858,7 +928,8 @@ class Instaloader: not node["likes"]["viewer_has_liked"] if "likes" in node else not node["viewer_has_liked"], - download_videos=download_videos, geotags=geotags) + download_videos=download_videos, geotags=geotags, + download_comments=download_comments) else: print("--login=USERNAME required to download {}.".format(pentry), file=sys.stderr) else: @@ -870,7 +941,7 @@ class Instaloader: try: try: self.download(target, profile_pic_only, download_videos, - geotags, fast_update) + geotags, download_comments, fast_update) except ProfileNotExistsException as err: if username is not None: self._log(err) @@ -878,7 +949,7 @@ class Instaloader: anonymous_loader = Instaloader(self.sleep, self.quiet, self.shorter_output, self.profile_subdirs, self.user_agent) anonymous_loader.download(target, profile_pic_only, download_videos, - geotags, fast_update) + geotags, download_comments, fast_update) else: raise err except NonfatalException as err: @@ -920,6 +991,10 @@ def main(): 'text file with the location\'s name and a Google Maps link. ' 'This requires an additional request to the Instagram ' 'server for each picture, which is why it is disabled by default.') + g_what.add_argument('-C', '--comments', action='store_true', + help='Download and update comments for each post. ' + 'This requires an additional request to the Instagram ' + 'server for each post, which is why it is disabled by default.') g_stop = parser.add_argument_group('When to Stop Downloading', 'If none of these options are given, Instaloader goes through all pictures ' @@ -978,8 +1053,8 @@ def main(): profile_subdirs=not args.no_profile_subdir, user_agent=args.user_agent) loader.download_profiles(args.profile, args.login, args.password, args.sessionfile, int(args.count) if args.count is not None else None, - args.profile_pic_only, not args.skip_videos, args.geotags, args.fast_update, - args.hashtag_username) + args.profile_pic_only, not args.skip_videos, args.geotags, args.download_comments, + args.fast_update, args.hashtag_username) except InstaloaderException as err: raise SystemExit("Fatal error: %s" % err)