From f0bebd0d96ba13304ba95da7b7be2e1076ad615b Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 12 Apr 2018 22:01:26 +0200 Subject: [PATCH] Saving and reimporting of JSON files Metadata JSON files are now created by default. They can later be given as a target to redownload Posts, StoryItems or Profiles with new settings. --- docs/cli-options.rst | 11 ++--- instaloader/__init__.py | 3 +- instaloader/__main__.py | 30 ++++++++++--- instaloader/instaloader.py | 38 +++++++++++------ instaloader/structures.py | 86 ++++++++++++++++++++++++++++++-------- 5 files changed, 127 insertions(+), 41 deletions(-) diff --git a/docs/cli-options.rst b/docs/cli-options.rst index 84dc864..5120465 100644 --- a/docs/cli-options.rst +++ b/docs/cli-options.rst @@ -67,12 +67,13 @@ automatically **finds it by its unique ID** and renames the folder likewise. Also **download stories** of each profile that is downloaded. Requires :option:`--login`. -.. option:: --metadata-json +.. option:: --no-metadata-json - Create a JSON file containing the metadata of each post. This does not - include comments (see :option:`--comments`) nor geotags (see - :option:`--geotags`). The JSON files contain the properties of - :class:`instaloader.Post`. + Do not create a JSON file containing the metadata of each post. + +.. option:: --no-compress-json + + Do not xz compress JSON files, rather create pretty formatted JSONs. .. option:: --stories-only diff --git a/instaloader/__init__.py b/instaloader/__init__.py index eed35c9..20fcc74 100644 --- a/instaloader/__init__.py +++ b/instaloader/__init__.py @@ -14,4 +14,5 @@ else: from .exceptions import * from .instaloader import Instaloader, Tristate -from .structures import Post, Profile, Story, StoryItem, shortcode_to_mediaid, mediaid_to_shortcode +from .structures import (Post, Profile, Story, StoryItem, load_structure_from_file, mediaid_to_shortcode, + save_structure_to_file, shortcode_to_mediaid) diff --git a/instaloader/__main__.py b/instaloader/__main__.py index 8e95899..01eb694 100644 --- a/instaloader/__main__.py +++ b/instaloader/__main__.py @@ -7,7 +7,7 @@ from argparse import ArgumentParser, SUPPRESS from typing import Callable, List, Optional from . import (Instaloader, InstaloaderException, InvalidArgumentException, Post, Profile, ProfileNotExistsException, - Tristate, __version__) + StoryItem, Tristate, __version__, load_structure_from_file) from .instaloader import get_default_session_filename from .instaloadercontext import default_user_agent @@ -86,6 +86,22 @@ def _main(instaloader: Instaloader, targetlist: List[str], try: # Generate set of profiles, already downloading non-profile targets for target in targetlist: + if (target.endswith('.json') or target.endswith('.json.xz')) and os.path.isfile(target): + with instaloader.context.error_catcher(target): + structure = load_structure_from_file(instaloader.context, target) + if isinstance(structure, Post): + instaloader.context.log("Downloading {} ({})".format(structure, target)) + instaloader.download_post(structure, os.path.dirname(target)) + elif isinstance(structure, StoryItem): + instaloader.context.log("Attempting to download {} ({})".format(structure, target)) + instaloader.download_story(structure, os.path.dirname(target)) + elif isinstance(structure, Profile): + instaloader.context.log("Going to download {} ({})".format(structure.username, target)) + profiles.add(structure.username) + else: + raise InvalidArgumentException("{} JSON file not supported as target" + .format(structure.__class__.__name__)) + continue # strip '/' characters to be more shell-autocompletion-friendly target = target.rstrip('/') with instaloader.context.error_catcher(target): @@ -181,9 +197,12 @@ def main(): 'server for each post, which is why it is disabled by default.') g_what.add_argument('--no-captions', action='store_true', help='Do not store media captions, although no additional request is needed to obtain them.') + g_what.add_argument('--no-metadata-json', action='store_true', + help='Do not create a JSON file containing the metadata of each post.') g_what.add_argument('--metadata-json', action='store_true', - help='Create a JSON file containing the metadata of each post. This does not include comments ' - 'nor geotags.') + help=SUPPRESS) + g_what.add_argument('--no-compress-json', action='store_true', + help='Do not xz compress JSON files, rather create pretty formatted JSONs.') g_what.add_argument('-s', '--stories', action='store_true', help='Also download stories of each profile that is downloaded. Requires --login.') g_what.add_argument('--stories-only', action='store_true', @@ -264,7 +283,7 @@ def main(): download_video_thumbnails = Tristate.always if not args.no_video_thumbnails else Tristate.never download_comments = Tristate.always if args.comments else Tristate.no_extra_query save_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never - save_metadata = Tristate.always if args.metadata_json else Tristate.never + save_metadata = Tristate.always if not args.no_metadata_json else Tristate.never if args.geotags and args.no_geotags: raise SystemExit("--geotags and --no-geotags given. I am confused and refuse to work.") @@ -281,7 +300,8 @@ def main(): download_videos=download_videos, download_video_thumbnails=download_video_thumbnails, download_geotags=download_geotags, save_captions=save_captions, download_comments=download_comments, - save_metadata=save_metadata, max_connection_attempts=args.max_connection_attempts) + save_metadata=save_metadata, compress_json=not args.no_compress_json, + max_connection_attempts=args.max_connection_attempts) _main(loader, args.profile, username=args.login.lower() if args.login is not None else None, diff --git a/instaloader/instaloader.py b/instaloader/instaloader.py index 7fd9eef..5eb7e72 100644 --- a/instaloader/instaloader.py +++ b/instaloader/instaloader.py @@ -16,7 +16,7 @@ from typing import Any, Callable, Dict, Iterator, List, Optional from .exceptions import * from .instaloadercontext import InstaloaderContext -from .structures import Post, Profile, Story, StoryItem +from .structures import JsonExportable, Post, Profile, Story, StoryItem, save_structure_to_file def get_default_session_filename(username: str) -> str: @@ -83,7 +83,8 @@ class Instaloader: download_geotags: Tristate = Tristate.no_extra_query, save_captions: Tristate = Tristate.no_extra_query, download_comments: Tristate = Tristate.no_extra_query, - save_metadata: Tristate = Tristate.never, + save_metadata: Tristate = Tristate.no_extra_query, + compress_json: bool = True, max_connection_attempts: int = 3): self.context = InstaloaderContext(sleep, quiet, user_agent, max_connection_attempts) @@ -108,6 +109,7 @@ class Instaloader: self.save_captions = save_captions self.download_comments = download_comments self.save_metadata = save_metadata + self.compress_json = compress_json @contextmanager def anonymous_copy(self): @@ -118,7 +120,7 @@ class Instaloader: self.download_video_thumbnails, self.download_geotags, self.save_captions, self.download_comments, - self.save_metadata, self.context.max_connection_attempts) + self.save_metadata, self.compress_json, self.context.max_connection_attempts) new_loader.context.previous_queries = self.context.previous_queries yield new_loader self.context.error_log.extend(new_loader.context.error_log) @@ -158,12 +160,16 @@ class Instaloader: os.utime(filename, (datetime.now().timestamp(), mtime.timestamp())) return True - def save_metadata_json(self, filename: str, post: Post) -> None: - """Saves metadata JSON file of a :class:`Post`.""" - filename += '.json' - with open(filename, 'w') as fp: - json.dump(post, fp=fp, indent=4, default=Post.json_encoder) - self.context.log('json', end=' ', flush=True) + def save_metadata_json(self, filename: str, structure: JsonExportable) -> None: + """Saves metadata JSON file of a structure.""" + if self.compress_json: + filename += '.json.xz' + else: + filename += '.json' + save_structure_to_file(structure, filename) + if isinstance(structure, (Post, StoryItem)): + # log 'json ' message when saving Post or StoryItem + self.context.log('json', end=' ', flush=True) def update_comments(self, filename: str, post: Post, filename_alt: Optional[str] = None) -> None: try: @@ -393,9 +399,8 @@ class Instaloader: if self.download_comments is Tristate.always: self.update_comments(filename=filename, filename_alt=filename_old, post=post) - # Save metadata as JSON if desired. It might require an extra query, depending on which information has been - # already obtained. Regarding Tristate interpretation, we always assume that it requires an extra query. - if self.save_metadata is Tristate.always: + # Save metadata as JSON if desired. + if self.save_metadata is not Tristate.never: self.save_metadata_json(filename, post) self.context.log() @@ -489,6 +494,9 @@ class Instaloader: filename_alt=filename_old, url=item.video_url, mtime=date_local) + # Save metadata as JSON if desired. + if self.save_metadata is not Tristate.never: + self.save_metadata_json(filename, item) self.context.log() return downloaded @@ -698,6 +706,12 @@ class Instaloader: profile_name = profile.username + # Save metadata as JSON if desired. + if self.save_metadata is not Tristate.never: + json_filename = '{0}/{1}_{2}'.format(self.dirname_pattern.format(profile=profile_name, target=profile_name), + profile_name, profile.userid) + self.save_metadata_json(json_filename, profile) + if self.context.is_logged_in and profile.has_blocked_viewer and not profile.is_private: # raising ProfileNotExistsException invokes "trying again anonymously" logic raise ProfileNotExistsException("Profile {} has blocked you".format(profile_name)) diff --git a/instaloader/structures.py b/instaloader/structures.py index 1769e06..fb89cfb 100644 --- a/instaloader/structures.py +++ b/instaloader/structures.py @@ -1,8 +1,11 @@ +import json +import lzma import re from base64 import b64decode, b64encode from datetime import datetime -from typing import Any, Dict, Iterator, List, Optional +from typing import Any, Dict, Iterator, List, Optional, Union +from . import __version__ from .exceptions import * from .instaloadercontext import InstaloaderContext @@ -65,6 +68,15 @@ class Post: """Create a post object from a given mediaid""" return cls.from_shortcode(context, mediaid_to_shortcode(mediaid)) + def get_node(self): + if self._full_metadata_dict: + node = self._full_metadata_dict + else: + node = self._node + if self._owner_profile: + node['owner'] = self.owner_profile.get_node() + return node + @property def shortcode(self) -> str: """Media shortcode. URL of the post is instagram.com/p//.""" @@ -283,23 +295,6 @@ class Post: params={'__a': 1}) return location_json["location"] if "location" in location_json else location_json['graphql']['location'] - @staticmethod - def json_encoder(obj) -> Dict[str, Any]: - """Convert instance of :class:`Post` to a JSON-serializable dictionary.""" - if not isinstance(obj, Post): - raise TypeError("Object of type {} is not a Post object.".format(obj.__class__.__name__)) - jsondict = {} - for prop in dir(Post): - if prop[0].isupper() or prop[0] == '_': - # skip uppercase and private properties - continue - val = obj.__getattribute__(prop) - if val is True or val is False or isinstance(val, (str, int, float, list)): - jsondict[prop] = val - elif isinstance(val, datetime): - jsondict[prop] = val.isoformat() - return jsondict - class Profile: """ @@ -342,6 +337,14 @@ class Profile: username = Post.from_mediaid(context, int(data['edges'][0]["node"]["id"])).owner_username return cls(context, {'username': username.lower(), 'id': profile_id}) + def get_node(self): + json_node = self._node.copy() + # remove posts + json_node.pop('edge_media_collections', None) + json_node.pop('edge_owner_to_timeline_media', None) + json_node.pop('edge_saved_media', None) + return json_node + def _obtain_metadata(self): try: if not self._rhx_gis: @@ -517,6 +520,12 @@ class StoryItem: self._node = node self._owner_profile = owner_profile + def get_node(self): + node = self._node + if self._owner_profile: + node['owner'] = self._owner_profile.get_node() + return node + @property def mediaid(self) -> int: """The mediaid is a decimal representation of the media shortcode.""" @@ -684,3 +693,44 @@ class Story: def get_items(self) -> Iterator[StoryItem]: """Retrieve all items from a story.""" yield from (StoryItem(self._context, item, self.owner_profile) for item in self._node['items']) + + +JsonExportable = Union[Post, Profile, StoryItem] + + +def save_structure_to_file(structure: JsonExportable, filename: str): + json_structure = {'node': structure.get_node(), + 'instaloader': {'version': __version__, 'node_type': structure.__class__.__name__}} + compress = filename.endswith('.xz') + if compress: + with lzma.open(filename, 'wt', check=lzma.CHECK_NONE) as fp: + json.dump(json_structure, fp=fp, separators=(',', ':')) + else: + with open(filename, 'wt') as fp: + json.dump(json_structure, fp=fp, indent=4, sort_keys=True) + + +def load_structure_from_file(context: InstaloaderContext, filename: str) -> JsonExportable: + compressed = filename.endswith('.xz') + if compressed: + fp = lzma.open(filename, 'rt') + else: + fp = open(filename, 'rt') + json_structure = json.load(fp) + fp.close() + if 'node' in json_structure and 'instaloader' in json_structure and \ + 'node_type' in json_structure['instaloader']: + node_type = json_structure['instaloader']['node_type'] + if node_type == "Post": + return Post(context, json_structure['node']) + elif node_type == "Profile": + return Profile(context, json_structure['node']) + elif node_type == "StoryItem": + return StoryItem(context, json_structure['node']) + else: + raise InvalidArgumentException("{}: Not an Instaloader JSON.".format(filename)) + elif 'shortcode' in json_structure: + # Post JSON created with Instaloader v3 + return Post.from_shortcode(context, json_structure['shortcode']) + else: + raise InvalidArgumentException("{}: Not an Instaloader JSON.".format(filename))