From e0ed4cf16c156aa4be5429a96fc0dc07d84a5f0a Mon Sep 17 00:00:00 2001
From: Alexander Graf
Date: Fri, 29 Sep 2017 14:02:58 +0200
Subject: [PATCH] Save metadata JSON with --metadata-json

With --metadata-json, a JSON file for each post is created saving the
Post properties defined in instaloader.Post class, i.e. caption, number
of likes, people tagged in caption or the picture itself, etc.

This closes #33 and closes #47.
---
 docs/basic-usage.rst |  7 ++++--
 docs/cli-options.rst |  7 ++++++
 instaloader.py       | 58 ++++++++++++++++++++++++++++++++++----------
 3 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/docs/basic-usage.rst b/docs/basic-usage.rst
index 273fbba..bd4f3b5 100644
--- a/docs/basic-usage.rst
+++ b/docs/basic-usage.rst
@@ -70,10 +70,13 @@ Instaloader supports the following targets:
 Instaloader goes through all media matching the specified targets and
 downloads the pictures and videos and their captions. You can specify
 
-- :option:`--comments`, to also **download comments** of each post and
+- :option:`--comments`, to also **download comments** of each post,
 
 - :option:`--geotags`, to **download geotags** of each post and save them as
-  Google Maps link.
+  Google Maps link,
+
+- :option:`--metadata-json`, to store further post metadata in a separate JSON
+  file.
 
 .. _filename-specification:
 
diff --git a/docs/cli-options.rst b/docs/cli-options.rst
index a9e10ab..e27cf54 100644
--- a/docs/cli-options.rst
+++ b/docs/cli-options.rst
@@ -56,6 +56,13 @@ Instead of a *profile* or a *#hashtag*, the special targets
    Also **download stories** of each profile that is downloaded. Requires
    :option:`--login`.
 
+.. option:: --metadata-json
+
+   Create a JSON file containing the metadata of each post. This does not
+   include comments (see :option:`--comments`) nor geotags (see
+   :option:`--geotags`). The JSON files contain the properties of
+   :class:`instaloader.Post`.
+
 .. option:: --stories-only
 
    Rather than downloading regular posts of each specified profile, only
diff --git a/instaloader.py b/instaloader.py
index fca3654..d775de4 100755
--- a/instaloader.py
+++ b/instaloader.py
@@ -170,8 +170,8 @@ class Post:
     metadata, if required. This class unifies access to the properties associated with a post. It implements == and is
     hashable.
 
-    The properties defined here are accessable by the filter expressions specified with the :option:`--only-if`
-    parameter.
+    The properties defined here are accessible by the filter expressions specified with the :option:`--only-if`
+    parameter and exported into JSON files with :option:`--metadata-json`.
     """
 
     LOGIN_REQUIRING_PROPERTIES = ["viewer_has_liked"]
@@ -361,6 +361,22 @@ class Post:
                                                    params={'__a': 1})
         return location_json["location"]
 
+    @staticmethod
+    def json_encoder(obj) -> Dict[str, Any]:
+        """Convert instance of :class:`Post` to a JSON-serializable dictionary."""
+        if not isinstance(obj, Post):
+            raise TypeError("Object of type {} is not a Post object.".format(obj.__class__.__name__))
+        jsondict = {}
+        for prop in dir(Post):
+            if prop[0].isupper() or prop[0] == '_':
+                # skip uppercase and private properties
+                continue
+            val = obj.__getattribute__(prop)
+            if val is True or val is False or isinstance(val, (str, int, float, list)):
+                jsondict[prop] = val
+            elif isinstance(val, datetime):
+                jsondict[prop] = val.isoformat()
+        return jsondict
 
 class Tristate(Enum):
     """Tri-state to encode whether we should save certain information, i.e. videos, captions, comments or geotags.
@@ -387,8 +403,9 @@ class Instaloader:
                  filename_pattern: Optional[str] = None,
                  download_videos: Tristate = Tristate.always,
                  download_geotags: Tristate = Tristate.no_extra_query,
-                 download_captions: Tristate = Tristate.no_extra_query,
-                 download_comments: Tristate = Tristate.no_extra_query):
+                 save_captions: Tristate = Tristate.no_extra_query,
+                 download_comments: Tristate = Tristate.no_extra_query,
+                 save_metadata: Tristate = Tristate.never):
 
         # configuration parameters
         self.user_agent = user_agent if user_agent is not None else default_user_agent()
@@ -401,16 +418,15 @@ class Instaloader:
             if filename_pattern is not None else '{date:%Y-%m-%d_%H-%M-%S}'
         self.download_videos = download_videos
         self.download_geotags = download_geotags
-        self.download_captions = download_captions
+        self.save_captions = save_captions
         self.download_comments = download_comments
-        self.previous_queries = dict()
+        self.save_metadata = save_metadata
 
         # error log, filled with error() and printed at the end of Instaloader.main()
         self.error_log = []
 
         # For the adaption of sleep intervals (rate control)
-        self.request_count = 0
-        self.last_request_time = 0
+        self.previous_queries = dict()
 
     @property
     def is_logged_in(self) -> bool:
@@ -423,7 +439,7 @@ class Instaloader:
         new_loader = Instaloader(self.sleep, self.quiet, self.user_agent,
                                  self.dirname_pattern, self.filename_pattern,
                                  self.download_videos, self.download_geotags,
-                                 self.download_captions, self.download_comments)
+                                 self.save_captions, self.download_comments, self.save_metadata)
         new_loader.previous_queries = self.previous_queries
         yield new_loader
         self.error_log.extend(new_loader.error_log)
@@ -688,6 +704,12 @@ class Instaloader:
         os.utime(filename, (datetime.now().timestamp(), mtime.timestamp()))
         return True
 
+    def save_metadata_json(self, filename: str, post: Post) -> None:
+        """Saves metadata of a :class:`Post` as JSON to ``filename + '.json'``, closing the file afterwards."""
+        with open(filename + '.json', 'w') as fp:
+            json.dump(post, fp=fp, indent=4, default=Post.json_encoder)
+        self._log('json', end=' ', flush=True)
+
     def update_comments(self, filename: str, post: Post) -> None:
         filename += '_comments.json'
         try:
@@ -885,7 +907,7 @@ class Instaloader:
             downloaded = False
 
         # Save caption if desired
-        if self.download_captions is not Tristate.never:
+        if self.save_captions is not Tristate.never:
             if post.caption:
                 self.save_caption(filename, post.date, post.caption)
             else:
@@ -905,6 +927,11 @@ class Instaloader:
         if self.download_comments is Tristate.always:
             self.update_comments(filename, post)
 
+        # Save metadata as JSON if desired. It might require an extra query, depending on which information has been
+        # already obtained. Regarding Tristate interpretation, we always assume that it requires an extra query.
+        if self.save_metadata is Tristate.always:
+            self.save_metadata_json(filename, post)
+
         self._log()
         return downloaded
 
@@ -989,7 +1016,7 @@ class Instaloader:
                         self._log("Warning: Unable to find story image.")
                         downloaded = False
                     if "caption" in item and item["caption"] is not None and \
-                            self.download_captions is not Tristate.never:
+                            self.save_captions is not Tristate.never:
                         caption = item["caption"]
                         if isinstance(caption, dict) and "text" in caption:
                             caption = caption["text"]
@@ -1385,6 +1412,9 @@ def main():
                              'server for each post, which is why it is disabled by default.')
     g_what.add_argument('--no-captions', action='store_true',
                         help='Do not store media captions, although no additional request is needed to obtain them.')
+    g_what.add_argument('--metadata-json', action='store_true',
+                        help='Create a JSON file containing the metadata of each post. This does not include comments '
+                             'nor geotags.')
     g_what.add_argument('-s', '--stories', action='store_true',
                         help='Also download stories of each profile that is downloaded. Requires --login.')
     g_what.add_argument('--stories-only', action='store_true',
@@ -1458,7 +1488,8 @@ def main():
 
         download_videos = Tristate.always if not args.no_videos else Tristate.no_extra_query
         download_comments = Tristate.always if args.comments else Tristate.no_extra_query
-        download_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never
+        save_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never
+        save_metadata = Tristate.always if args.metadata_json else Tristate.never
 
         if args.geotags and args.no_geotags:
             raise SystemExit("--geotags and --no-geotags given. I am confused and refuse to work.")
@@ -1473,7 +1504,8 @@ def main():
                              user_agent=args.user_agent,
                              dirname_pattern=args.dirname_pattern, filename_pattern=args.filename_pattern,
                              download_videos=download_videos, download_geotags=download_geotags,
-                             download_captions=download_captions, download_comments=download_comments)
+                             save_captions=save_captions, download_comments=download_comments,
+                             save_metadata=save_metadata)
         loader.main(args.profile, args.login.lower() if args.login is not None else None, args.password,
                     args.sessionfile,
                     int(args.count) if args.count is not None else None,