Saving and reimporting of JSON files

Metadata JSON files are now created by default. They can later be given
as a target to redownload Posts, StoryItems or Profiles with new
settings.
This commit is contained in:
Alexander Graf 2018-04-12 22:01:26 +02:00
parent 5d249c5401
commit f0bebd0d96
5 changed files with 127 additions and 41 deletions

View File

@ -67,12 +67,13 @@ automatically **finds it by its unique ID** and renames the folder likewise.
Also **download stories** of each profile that is downloaded. Requires Also **download stories** of each profile that is downloaded. Requires
:option:`--login`. :option:`--login`.
.. option:: --metadata-json .. option:: --no-metadata-json
Create a JSON file containing the metadata of each post. This does not Do not create a JSON file containing the metadata of each post.
include comments (see :option:`--comments`) nor geotags (see
:option:`--geotags`). The JSON files contain the properties of .. option:: --no-compress-json
:class:`instaloader.Post`.
Do not xz-compress JSON files; instead, create pretty-formatted JSON files.
.. option:: --stories-only .. option:: --stories-only

View File

@ -14,4 +14,5 @@ else:
from .exceptions import * from .exceptions import *
from .instaloader import Instaloader, Tristate from .instaloader import Instaloader, Tristate
from .structures import Post, Profile, Story, StoryItem, shortcode_to_mediaid, mediaid_to_shortcode from .structures import (Post, Profile, Story, StoryItem, load_structure_from_file, mediaid_to_shortcode,
save_structure_to_file, shortcode_to_mediaid)

View File

@ -7,7 +7,7 @@ from argparse import ArgumentParser, SUPPRESS
from typing import Callable, List, Optional from typing import Callable, List, Optional
from . import (Instaloader, InstaloaderException, InvalidArgumentException, Post, Profile, ProfileNotExistsException, from . import (Instaloader, InstaloaderException, InvalidArgumentException, Post, Profile, ProfileNotExistsException,
Tristate, __version__) StoryItem, Tristate, __version__, load_structure_from_file)
from .instaloader import get_default_session_filename from .instaloader import get_default_session_filename
from .instaloadercontext import default_user_agent from .instaloadercontext import default_user_agent
@ -86,6 +86,22 @@ def _main(instaloader: Instaloader, targetlist: List[str],
try: try:
# Generate set of profiles, already downloading non-profile targets # Generate set of profiles, already downloading non-profile targets
for target in targetlist: for target in targetlist:
if (target.endswith('.json') or target.endswith('.json.xz')) and os.path.isfile(target):
with instaloader.context.error_catcher(target):
structure = load_structure_from_file(instaloader.context, target)
if isinstance(structure, Post):
instaloader.context.log("Downloading {} ({})".format(structure, target))
instaloader.download_post(structure, os.path.dirname(target))
elif isinstance(structure, StoryItem):
instaloader.context.log("Attempting to download {} ({})".format(structure, target))
instaloader.download_story(structure, os.path.dirname(target))
elif isinstance(structure, Profile):
instaloader.context.log("Going to download {} ({})".format(structure.username, target))
profiles.add(structure.username)
else:
raise InvalidArgumentException("{} JSON file not supported as target"
.format(structure.__class__.__name__))
continue
# strip '/' characters to be more shell-autocompletion-friendly # strip '/' characters to be more shell-autocompletion-friendly
target = target.rstrip('/') target = target.rstrip('/')
with instaloader.context.error_catcher(target): with instaloader.context.error_catcher(target):
@ -181,9 +197,12 @@ def main():
'server for each post, which is why it is disabled by default.') 'server for each post, which is why it is disabled by default.')
g_what.add_argument('--no-captions', action='store_true', g_what.add_argument('--no-captions', action='store_true',
help='Do not store media captions, although no additional request is needed to obtain them.') help='Do not store media captions, although no additional request is needed to obtain them.')
g_what.add_argument('--no-metadata-json', action='store_true',
help='Do not create a JSON file containing the metadata of each post.')
g_what.add_argument('--metadata-json', action='store_true', g_what.add_argument('--metadata-json', action='store_true',
help='Create a JSON file containing the metadata of each post. This does not include comments ' help=SUPPRESS)
'nor geotags.') g_what.add_argument('--no-compress-json', action='store_true',
help='Do not xz compress JSON files, rather create pretty formatted JSONs.')
g_what.add_argument('-s', '--stories', action='store_true', g_what.add_argument('-s', '--stories', action='store_true',
help='Also download stories of each profile that is downloaded. Requires --login.') help='Also download stories of each profile that is downloaded. Requires --login.')
g_what.add_argument('--stories-only', action='store_true', g_what.add_argument('--stories-only', action='store_true',
@ -264,7 +283,7 @@ def main():
download_video_thumbnails = Tristate.always if not args.no_video_thumbnails else Tristate.never download_video_thumbnails = Tristate.always if not args.no_video_thumbnails else Tristate.never
download_comments = Tristate.always if args.comments else Tristate.no_extra_query download_comments = Tristate.always if args.comments else Tristate.no_extra_query
save_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never save_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never
save_metadata = Tristate.always if args.metadata_json else Tristate.never save_metadata = Tristate.always if not args.no_metadata_json else Tristate.never
if args.geotags and args.no_geotags: if args.geotags and args.no_geotags:
raise SystemExit("--geotags and --no-geotags given. I am confused and refuse to work.") raise SystemExit("--geotags and --no-geotags given. I am confused and refuse to work.")
@ -281,7 +300,8 @@ def main():
download_videos=download_videos, download_video_thumbnails=download_video_thumbnails, download_videos=download_videos, download_video_thumbnails=download_video_thumbnails,
download_geotags=download_geotags, download_geotags=download_geotags,
save_captions=save_captions, download_comments=download_comments, save_captions=save_captions, download_comments=download_comments,
save_metadata=save_metadata, max_connection_attempts=args.max_connection_attempts) save_metadata=save_metadata, compress_json=not args.no_compress_json,
max_connection_attempts=args.max_connection_attempts)
_main(loader, _main(loader,
args.profile, args.profile,
username=args.login.lower() if args.login is not None else None, username=args.login.lower() if args.login is not None else None,

View File

@ -16,7 +16,7 @@ from typing import Any, Callable, Dict, Iterator, List, Optional
from .exceptions import * from .exceptions import *
from .instaloadercontext import InstaloaderContext from .instaloadercontext import InstaloaderContext
from .structures import Post, Profile, Story, StoryItem from .structures import JsonExportable, Post, Profile, Story, StoryItem, save_structure_to_file
def get_default_session_filename(username: str) -> str: def get_default_session_filename(username: str) -> str:
@ -83,7 +83,8 @@ class Instaloader:
download_geotags: Tristate = Tristate.no_extra_query, download_geotags: Tristate = Tristate.no_extra_query,
save_captions: Tristate = Tristate.no_extra_query, save_captions: Tristate = Tristate.no_extra_query,
download_comments: Tristate = Tristate.no_extra_query, download_comments: Tristate = Tristate.no_extra_query,
save_metadata: Tristate = Tristate.never, save_metadata: Tristate = Tristate.no_extra_query,
compress_json: bool = True,
max_connection_attempts: int = 3): max_connection_attempts: int = 3):
self.context = InstaloaderContext(sleep, quiet, user_agent, max_connection_attempts) self.context = InstaloaderContext(sleep, quiet, user_agent, max_connection_attempts)
@ -108,6 +109,7 @@ class Instaloader:
self.save_captions = save_captions self.save_captions = save_captions
self.download_comments = download_comments self.download_comments = download_comments
self.save_metadata = save_metadata self.save_metadata = save_metadata
self.compress_json = compress_json
@contextmanager @contextmanager
def anonymous_copy(self): def anonymous_copy(self):
@ -118,7 +120,7 @@ class Instaloader:
self.download_video_thumbnails, self.download_video_thumbnails,
self.download_geotags, self.download_geotags,
self.save_captions, self.download_comments, self.save_captions, self.download_comments,
self.save_metadata, self.context.max_connection_attempts) self.save_metadata, self.compress_json, self.context.max_connection_attempts)
new_loader.context.previous_queries = self.context.previous_queries new_loader.context.previous_queries = self.context.previous_queries
yield new_loader yield new_loader
self.context.error_log.extend(new_loader.context.error_log) self.context.error_log.extend(new_loader.context.error_log)
@ -158,12 +160,16 @@ class Instaloader:
os.utime(filename, (datetime.now().timestamp(), mtime.timestamp())) os.utime(filename, (datetime.now().timestamp(), mtime.timestamp()))
return True return True
def save_metadata_json(self, filename: str, post: Post) -> None: def save_metadata_json(self, filename: str, structure: JsonExportable) -> None:
"""Saves metadata JSON file of a :class:`Post`.""" """Saves metadata JSON file of a structure."""
filename += '.json' if self.compress_json:
with open(filename, 'w') as fp: filename += '.json.xz'
json.dump(post, fp=fp, indent=4, default=Post.json_encoder) else:
self.context.log('json', end=' ', flush=True) filename += '.json'
save_structure_to_file(structure, filename)
if isinstance(structure, (Post, StoryItem)):
# log 'json ' message when saving Post or StoryItem
self.context.log('json', end=' ', flush=True)
def update_comments(self, filename: str, post: Post, filename_alt: Optional[str] = None) -> None: def update_comments(self, filename: str, post: Post, filename_alt: Optional[str] = None) -> None:
try: try:
@ -393,9 +399,8 @@ class Instaloader:
if self.download_comments is Tristate.always: if self.download_comments is Tristate.always:
self.update_comments(filename=filename, filename_alt=filename_old, post=post) self.update_comments(filename=filename, filename_alt=filename_old, post=post)
# Save metadata as JSON if desired. It might require an extra query, depending on which information has been # Save metadata as JSON if desired.
# already obtained. Regarding Tristate interpretation, we always assume that it requires an extra query. if self.save_metadata is not Tristate.never:
if self.save_metadata is Tristate.always:
self.save_metadata_json(filename, post) self.save_metadata_json(filename, post)
self.context.log() self.context.log()
@ -489,6 +494,9 @@ class Instaloader:
filename_alt=filename_old, filename_alt=filename_old,
url=item.video_url, url=item.video_url,
mtime=date_local) mtime=date_local)
# Save metadata as JSON if desired.
if self.save_metadata is not Tristate.never:
self.save_metadata_json(filename, item)
self.context.log() self.context.log()
return downloaded return downloaded
@ -698,6 +706,12 @@ class Instaloader:
profile_name = profile.username profile_name = profile.username
# Save metadata as JSON if desired.
if self.save_metadata is not Tristate.never:
json_filename = '{0}/{1}_{2}'.format(self.dirname_pattern.format(profile=profile_name, target=profile_name),
profile_name, profile.userid)
self.save_metadata_json(json_filename, profile)
if self.context.is_logged_in and profile.has_blocked_viewer and not profile.is_private: if self.context.is_logged_in and profile.has_blocked_viewer and not profile.is_private:
# raising ProfileNotExistsException invokes "trying again anonymously" logic # raising ProfileNotExistsException invokes "trying again anonymously" logic
raise ProfileNotExistsException("Profile {} has blocked you".format(profile_name)) raise ProfileNotExistsException("Profile {} has blocked you".format(profile_name))

View File

@ -1,8 +1,11 @@
import json
import lzma
import re import re
from base64 import b64decode, b64encode from base64 import b64decode, b64encode
from datetime import datetime from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional from typing import Any, Dict, Iterator, List, Optional, Union
from . import __version__
from .exceptions import * from .exceptions import *
from .instaloadercontext import InstaloaderContext from .instaloadercontext import InstaloaderContext
@ -65,6 +68,15 @@ class Post:
"""Create a post object from a given mediaid""" """Create a post object from a given mediaid"""
return cls.from_shortcode(context, mediaid_to_shortcode(mediaid)) return cls.from_shortcode(context, mediaid_to_shortcode(mediaid))
def get_node(self):
    """Return the JSON-exportable node dictionary of this post.

    The full metadata dictionary is used when it is set, otherwise the
    basic node. When an owner profile is known, the ``'owner'`` entry of
    the result is replaced by that profile's own node.

    The result is a shallow copy, so setting ``'owner'`` here does not
    alter the dictionaries held by this object.
    """
    source = self._full_metadata_dict if self._full_metadata_dict else self._node
    # Copy first: assigning 'owner' below must not modify the stored node.
    node = dict(source)
    if self._owner_profile:
        node['owner'] = self.owner_profile.get_node()
    return node
@property @property
def shortcode(self) -> str: def shortcode(self) -> str:
"""Media shortcode. URL of the post is instagram.com/p/<shortcode>/.""" """Media shortcode. URL of the post is instagram.com/p/<shortcode>/."""
@ -283,23 +295,6 @@ class Post:
params={'__a': 1}) params={'__a': 1})
return location_json["location"] if "location" in location_json else location_json['graphql']['location'] return location_json["location"] if "location" in location_json else location_json['graphql']['location']
@staticmethod
def json_encoder(obj) -> Dict[str, Any]:
"""Convert instance of :class:`Post` to a JSON-serializable dictionary."""
if not isinstance(obj, Post):
raise TypeError("Object of type {} is not a Post object.".format(obj.__class__.__name__))
jsondict = {}
for prop in dir(Post):
if prop[0].isupper() or prop[0] == '_':
# skip uppercase and private properties
continue
val = obj.__getattribute__(prop)
if val is True or val is False or isinstance(val, (str, int, float, list)):
jsondict[prop] = val
elif isinstance(val, datetime):
jsondict[prop] = val.isoformat()
return jsondict
class Profile: class Profile:
""" """
@ -342,6 +337,14 @@ class Profile:
username = Post.from_mediaid(context, int(data['edges'][0]["node"]["id"])).owner_username username = Post.from_mediaid(context, int(data['edges'][0]["node"]["id"])).owner_username
return cls(context, {'username': username.lower(), 'id': profile_id}) return cls(context, {'username': username.lower(), 'id': profile_id})
def get_node(self):
    """Return a copy of this profile's node without its media edge lists.

    The entries 'edge_media_collections', 'edge_owner_to_timeline_media'
    and 'edge_saved_media' are left out; every other entry is kept. The
    stored node itself is not modified.
    """
    # keys holding posts, which are not part of the exported node
    post_edges = ('edge_media_collections',
                  'edge_owner_to_timeline_media',
                  'edge_saved_media')
    return {key: value for key, value in self._node.items() if key not in post_edges}
def _obtain_metadata(self): def _obtain_metadata(self):
try: try:
if not self._rhx_gis: if not self._rhx_gis:
@ -517,6 +520,12 @@ class StoryItem:
self._node = node self._node = node
self._owner_profile = owner_profile self._owner_profile = owner_profile
def get_node(self):
    """Return the JSON-exportable node dictionary of this story item.

    When an owner profile is known, the ``'owner'`` entry of the result is
    replaced by that profile's own node.

    The result is a shallow copy, so setting ``'owner'`` here does not
    alter the node held by this object.
    """
    # Copy first: assigning 'owner' below must not modify the stored node.
    node = dict(self._node)
    if self._owner_profile:
        node['owner'] = self._owner_profile.get_node()
    return node
@property @property
def mediaid(self) -> int: def mediaid(self) -> int:
"""The mediaid is a decimal representation of the media shortcode.""" """The mediaid is a decimal representation of the media shortcode."""
@ -684,3 +693,44 @@ class Story:
def get_items(self) -> Iterator[StoryItem]: def get_items(self) -> Iterator[StoryItem]:
"""Retrieve all items from a story.""" """Retrieve all items from a story."""
yield from (StoryItem(self._context, item, self.owner_profile) for item in self._node['items']) yield from (StoryItem(self._context, item, self.owner_profile) for item in self._node['items'])
JsonExportable = Union[Post, Profile, StoryItem]
def save_structure_to_file(structure: JsonExportable, filename: str):
    """Write a structure's node, plus Instaloader bookkeeping, to a JSON file.

    The written object has two keys: ``'node'`` (the result of
    ``structure.get_node()``) and ``'instaloader'`` (the package version and
    the structure's class name as ``'node_type'``).

    :param structure: Object whose node is exported.
    :param filename: Target path. A name ending in ``.xz`` produces compact,
       xz-compressed JSON; any other name produces indented JSON with sorted
       keys.
    """
    payload = {
        'node': structure.get_node(),
        'instaloader': {'version': __version__, 'node_type': structure.__class__.__name__},
    }
    if not filename.endswith('.xz'):
        # plain file: human-readable output
        with open(filename, 'wt') as out:
            json.dump(payload, fp=out, indent=4, sort_keys=True)
        return
    # compressed file: most compact separators, no integrity check
    with lzma.open(filename, 'wt', check=lzma.CHECK_NONE) as out:
        json.dump(payload, fp=out, separators=(',', ':'))
def load_structure_from_file(context: InstaloaderContext, filename: str) -> JsonExportable:
    """Recreate a Post, Profile or StoryItem from a JSON file.

    :param context: Context handed to the constructed structure.
    :param filename: Path of the JSON file. A name ending in ``.xz`` is read
       as xz-compressed, any other name as plain text.
    :return: The structure named by the file's ``instaloader.node_type``
       entry, built from the file's ``node`` entry. A file without these
       entries but with a top-level ``shortcode`` is treated as a Post JSON
       created with Instaloader v3 and is loaded by shortcode.
    :raises InvalidArgumentException: If the file matches neither layout or
       names an unsupported node type.
    """
    opener = lzma.open if filename.endswith('.xz') else open
    # The context manager closes the file even when json.load() raises.
    with opener(filename, 'rt') as fp:
        json_structure = json.load(fp)
    if 'node' in json_structure and 'instaloader' in json_structure and \
            'node_type' in json_structure['instaloader']:
        node_type = json_structure['instaloader']['node_type']
        if node_type == "Post":
            return Post(context, json_structure['node'])
        elif node_type == "Profile":
            return Profile(context, json_structure['node'])
        elif node_type == "StoryItem":
            return StoryItem(context, json_structure['node'])
        else:
            raise InvalidArgumentException("{}: Not an Instaloader JSON.".format(filename))
    elif 'shortcode' in json_structure:
        # Post JSON created with Instaloader v3
        return Post.from_shortcode(context, json_structure['shortcode'])
    else:
        raise InvalidArgumentException("{}: Not an Instaloader JSON.".format(filename))