Fix download of hashtags and locations

Fixes #1080, fixes #1129, closes #1240.
This commit is contained in:
Alexander Graf 2021-11-12 20:17:24 +01:00
parent d6fd4c560c
commit 5d18857695
3 changed files with 165 additions and 38 deletions

View File

@ -22,6 +22,7 @@ from .exceptions import *
from .instaloadercontext import InstaloaderContext, RateController from .instaloadercontext import InstaloaderContext, RateController
from .lateststamps import LatestStamps from .lateststamps import LatestStamps
from .nodeiterator import NodeIterator, resumable_iteration from .nodeiterator import NodeIterator, resumable_iteration
from .sectioniterator import SectionIterator
from .structures import (Hashtag, Highlight, JsonExportable, Post, PostLocation, Profile, Story, StoryItem, from .structures import (Hashtag, Highlight, JsonExportable, Post, PostLocation, Profile, Story, StoryItem,
load_structure_from_file, save_structure_to_file, PostSidecarNode, TitlePic) load_structure_from_file, save_structure_to_file, PostSidecarNode, TitlePic)
@ -1088,18 +1089,12 @@ class Instaloader:
.. versionchanged:: 4.2.9 .. versionchanged:: 4.2.9
Require being logged in (as required by Instagram) Require being logged in (as required by Instagram)
""" """
has_next_page = True yield from SectionIterator(
end_cursor = None self.context,
while has_next_page: lambda d: d["native_location_data"]["recent"],
if end_cursor: lambda m: Post.from_iphone_struct(self.context, m),
params = {'__a': 1, 'max_id': end_cursor} f"explore/locations/{location}/",
else: )
params = {'__a': 1}
location_data = self.context.get_json('explore/locations/{0}/'.format(location),
params)['graphql']['location']['edge_location_to_media']
yield from (Post(self.context, edge['node']) for edge in location_data['edges'])
has_next_page = location_data['page_info']['has_next_page']
end_cursor = location_data['page_info']['end_cursor']
@_requires_login @_requires_login
def download_location(self, location: str, def download_location(self, location: str,

View File

@ -0,0 +1,46 @@
from typing import Any, Callable, Dict, Iterator, Optional, TypeVar
from .instaloadercontext import InstaloaderContext
T = TypeVar('T')


class SectionIterator(Iterator[T]):
    """Iterator for the new 'sections'-style responses.

    A response page is a dict with a ``sections`` list, a ``more_available`` flag and, when more pages
    exist, a ``next_max_id`` cursor. Every section holds its items in ``layout_content.medias``; the
    ``media`` entry of each item is passed through ``media_wrapper`` and returned to the caller.

    :param context: Context whose ``get_json()`` is used to fetch pages.
    :param sections_extractor: Maps the raw JSON response to the page dict described above.
    :param media_wrapper: Converts one raw ``media`` dict into the item type that is iterated over.
    :param query_path: Path handed to ``get_json()`` for every page request.
    :param first_data: Already extracted first page; if omitted (or empty), the first page is queried.

    .. versionadded:: 4.9"""

    def __init__(self,
                 context: InstaloaderContext,
                 sections_extractor: Callable[[Dict[str, Any]], Dict[str, Any]],
                 media_wrapper: Callable[[Dict], T],
                 query_path: str,
                 first_data: Optional[Dict[str, Any]] = None):
        self._context = context
        self._sections_extractor = sections_extractor
        self._media_wrapper = media_wrapper
        self._query_path = query_path
        self._data = first_data or self._query()
        # _page_index selects the section within the current page,
        # _section_index selects the media within that section.
        self._page_index = 0
        self._section_index = 0

    def __iter__(self):
        return self

    def _query(self, max_id: Optional[str] = None) -> Dict[str, Any]:
        """Fetch one page (the first one if ``max_id`` is None) and return its extracted page dict."""
        pagination_variables = {"max_id": max_id} if max_id is not None else {}
        return self._sections_extractor(
            self._context.get_json(self._query_path, params={"__a": 1, **pagination_variables})
        )

    def __next__(self) -> T:
        # Loop instead of recursing, so that a run of empty pages cannot exhaust the call stack.
        while True:
            sections = self._data['sections']
            while self._page_index < len(sections):
                medias = sections[self._page_index]['layout_content']['medias']
                if self._section_index < len(medias):
                    media = medias[self._section_index]['media']
                    self._section_index += 1
                    return self._media_wrapper(media)
                # Section exhausted (or empty from the start): advance to the next one.
                self._section_index = 0
                self._page_index += 1
            if not self._data['more_available']:
                raise StopIteration()
            # The query is evaluated before any state is reset, so a failing request leaves the
            # iterator untouched.
            self._page_index, self._section_index, self._data = 0, 0, self._query(self._data["next_max_id"])

View File

@ -3,7 +3,9 @@ import lzma
import re import re
from base64 import b64decode, b64encode from base64 import b64decode, b64encode
from collections import namedtuple from collections import namedtuple
from contextlib import suppress
from datetime import datetime from datetime import datetime
from itertools import islice
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
@ -11,6 +13,7 @@ from . import __version__
from .exceptions import * from .exceptions import *
from .instaloadercontext import InstaloaderContext from .instaloadercontext import InstaloaderContext
from .nodeiterator import FrozenNodeIterator, NodeIterator from .nodeiterator import FrozenNodeIterator, NodeIterator
from .sectioniterator import SectionIterator
PostSidecarNode = namedtuple('PostSidecarNode', ['is_video', 'display_url', 'video_url']) PostSidecarNode = namedtuple('PostSidecarNode', ['is_video', 'display_url', 'video_url'])
PostSidecarNode.__doc__ = "Item of a Sidecar Post." PostSidecarNode.__doc__ = "Item of a Sidecar Post."
@ -89,6 +92,41 @@ class Post:
"""Create a post object from a given mediaid""" """Create a post object from a given mediaid"""
return cls.from_shortcode(context, Post.mediaid_to_shortcode(mediaid)) return cls.from_shortcode(context, Post.mediaid_to_shortcode(mediaid))
@classmethod
def from_iphone_struct(cls, context: InstaloaderContext, media: Dict[str, Any]):
    """Create a post from a given iphone_struct.

    The fields of ``media`` are translated into a GraphQL-like node dict, and the original struct is
    kept in that node under ``iphone_struct``. If ``media`` contains a ``user`` entry, it is turned
    into the owner profile via :meth:`Profile.from_iphone_struct`.

    :param context: Context the post is bound to.
    :param media: Media dict as found in 'sections'-style responses.
    :raises KeyError: If a mandatory field (``code``, ``pk``, ``media_type``, ``taken_at``,
       ``has_liked``, ``like_count``) is missing or ``media_type`` has an unknown value.

    .. versionadded:: 4.9"""
    media_types = {
        1: "GraphImage",
        2: "GraphVideo",
        8: "GraphSidecar",
    }
    typename = media_types[media["media_type"]]
    caption = media.get("caption")
    fake_node = {
        "shortcode": media["code"],
        "id": media["pk"],
        "__typename": typename,
        "is_video": typename == "GraphVideo",
        "date": media["taken_at"],
        "caption": caption.get("text") if caption is not None else None,
        "title": media.get("title"),
        "viewer_has_liked": media["has_liked"],
        "edge_media_preview_like": {"count": media["like_count"]},
        "iphone_struct": media,
    }
    # Optional parts are best-effort: a field that is absent, None or an empty list is skipped
    # rather than aborting the construction of the post.
    unavailable = (KeyError, TypeError, IndexError)
    with suppress(*unavailable):
        fake_node["display_url"] = media['image_versions2']['candidates'][0]['url']
    with suppress(*unavailable):
        fake_node["video_url"] = media['video_versions'][-1]['url']
        fake_node["video_duration"] = media["video_duration"]
        fake_node["video_view_count"] = media["view_count"]
    with suppress(*unavailable):
        fake_node["edge_sidecar_to_children"] = {"edges": [{"node": {
            "display_url": node['image_versions2']['candidates'][0]['url'],
            "is_video": media_types[node["media_type"]] == "GraphVideo",
        }} for node in media["carousel_media"]]}
    owner_profile = Profile.from_iphone_struct(context, media["user"]) if "user" in media else None
    return cls(context, fake_node, owner_profile)
@staticmethod @staticmethod
def shortcode_to_mediaid(code: str) -> int: def shortcode_to_mediaid(code: str) -> int:
if len(code) > 11: if len(code) > 11:
@ -665,6 +703,20 @@ class Profile:
context.profile_id_cache[profile_id] = profile context.profile_id_cache[profile_id] = profile
return profile return profile
@classmethod
def from_iphone_struct(cls, context: InstaloaderContext, media: Dict[str, Any]):
    """Create a profile from a given iphone_struct.

    The listed fields of ``media`` are copied into a node dict under the names used by
    GraphQL-style profile nodes; the struct itself is kept in the node as ``iphone_struct``.
    A missing field raises :class:`KeyError`.

    .. versionadded:: 4.9"""
    # (node key, iphone_struct key), in the order the fields are read.
    field_map = (
        ("id", "pk"),
        ("username", "username"),
        ("is_private", "is_private"),
        ("full_name", "full_name"),
        ("profile_pic_url_hd", "profile_pic_url"),
    )
    node = {node_key: media[struct_key] for node_key, struct_key in field_map}
    node["iphone_struct"] = media
    return cls(context, node)
@classmethod @classmethod
def own_profile(cls, context: InstaloaderContext): def own_profile(cls, context: InstaloaderContext):
"""Return own profile if logged-in. """Return own profile if logged-in.
@ -1359,6 +1411,9 @@ class Hashtag:
L.download_post(post, target="#"+hashtag.name) L.download_post(post, target="#"+hashtag.name)
Also, this class implements == and is hashable. Also, this class implements == and is hashable.
.. versionchanged:: 4.9
Removed ``get_related_tags()`` and ``is_top_media_only`` as these features were removed from Instagram.
""" """
def __init__(self, context: InstaloaderContext, node: Dict[str, Any]): def __init__(self, context: InstaloaderContext, node: Dict[str, Any]):
assert "name" in node assert "name" in node
@ -1387,8 +1442,8 @@ class Hashtag:
return self._node["name"].lower() return self._node["name"].lower()
def _query(self, params): def _query(self, params):
return self._context.get_json("explore/tags/{0}/".format(self.name), json_response = self._context.get_json("explore/tags/{0}/".format(self.name), params)
params)["graphql"]["hashtag"] return json_response["graphql"]["hashtag"] if "graphql" in json_response else json_response["data"]
def _obtain_metadata(self): def _obtain_metadata(self):
if not self._has_full_metadata: if not self._has_full_metadata:
@ -1399,7 +1454,9 @@ class Hashtag:
json_node = self._node.copy() json_node = self._node.copy()
# remove posts # remove posts
json_node.pop("edge_hashtag_to_top_posts", None) json_node.pop("edge_hashtag_to_top_posts", None)
json_node.pop("top", None)
json_node.pop("edge_hashtag_to_media", None) json_node.pop("edge_hashtag_to_media", None)
json_node.pop("recent", None)
return json_node return json_node
def __repr__(self): def __repr__(self):
@ -1435,30 +1492,33 @@ class Hashtag:
return self._metadata("profile_pic_url") return self._metadata("profile_pic_url")
@property @property
def description(self) -> str: def description(self) -> Optional[str]:
return self._metadata("description") return self._metadata("description")
@property @property
def allow_following(self) -> bool: def allow_following(self) -> bool:
return self._metadata("allow_following") return bool(self._metadata("allow_following"))
@property @property
def is_following(self) -> bool: def is_following(self) -> bool:
try:
return self._metadata("is_following") return self._metadata("is_following")
except KeyError:
@property return bool(self._metadata("following"))
def is_top_media_only(self) -> bool:
return self._metadata("is_top_media_only")
def get_related_tags(self) -> Iterator["Hashtag"]:
"""Yields similar hashtags."""
yield from (Hashtag(self._context, edge["node"])
for edge in self._metadata("edge_hashtag_to_related_tags", "edges"))
def get_top_posts(self) -> Iterator[Post]: def get_top_posts(self) -> Iterator[Post]:
"""Yields the top posts of the hashtag.""" """Yields the top posts of the hashtag."""
try:
yield from (Post(self._context, edge["node"]) yield from (Post(self._context, edge["node"])
for edge in self._metadata("edge_hashtag_to_top_posts", "edges")) for edge in self._metadata("edge_hashtag_to_top_posts", "edges"))
except KeyError:
yield from SectionIterator(
self._context,
lambda d: d["data"]["top"],
lambda m: Post.from_iphone_struct(self._context, m),
f"explore/tags/{self.name}/",
self._metadata("top"),
)
@property @property
def mediacount(self) -> int: def mediacount(self) -> int:
@ -1468,10 +1528,14 @@ class Hashtag:
The number of posts with a certain hashtag may differ from the number of posts that can actually be accessed, as The number of posts with a certain hashtag may differ from the number of posts that can actually be accessed, as
the hashtag count might include private posts the hashtag count might include private posts
""" """
try:
return self._metadata("edge_hashtag_to_media", "count") return self._metadata("edge_hashtag_to_media", "count")
except KeyError:
return self._metadata("media_count")
def get_posts(self) -> Iterator[Post]: def get_posts(self) -> Iterator[Post]:
"""Yields the posts associated with this hashtag.""" """Yields the recent posts associated with this hashtag."""
try:
self._metadata("edge_hashtag_to_media", "edges") self._metadata("edge_hashtag_to_media", "edges")
self._metadata("edge_hashtag_to_media", "page_info") self._metadata("edge_hashtag_to_media", "page_info")
conn = self._metadata("edge_hashtag_to_media") conn = self._metadata("edge_hashtag_to_media")
@ -1480,10 +1544,18 @@ class Hashtag:
data = self._query({'__a': 1, 'max_id': conn["page_info"]["end_cursor"]}) data = self._query({'__a': 1, 'max_id': conn["page_info"]["end_cursor"]})
conn = data["edge_hashtag_to_media"] conn = data["edge_hashtag_to_media"]
yield from (Post(self._context, edge["node"]) for edge in conn["edges"]) yield from (Post(self._context, edge["node"]) for edge in conn["edges"])
except KeyError:
yield from SectionIterator(
self._context,
lambda d: d["data"]["recent"],
lambda m: Post.from_iphone_struct(self._context, m),
f"explore/tags/{self.name}/",
self._metadata("recent"),
)
def get_all_posts(self) -> Iterator[Post]: def get_all_posts(self) -> Iterator[Post]:
"""Yields all posts, i.e. all most recent posts and the top posts, in almost-chronological order.""" """Yields all posts, i.e. all most recent posts and the top posts, in almost-chronological order."""
sorted_top_posts = iter(sorted(self.get_top_posts(), key=lambda p: p.date_utc, reverse=True)) sorted_top_posts = iter(sorted(islice(self.get_top_posts(), 9), key=lambda p: p.date_utc, reverse=True))
other_posts = self.get_posts() other_posts = self.get_posts()
next_top = next(sorted_top_posts, None) next_top = next(sorted_top_posts, None)
next_other = next(other_posts, None) next_other = next(other_posts, None)
@ -1510,6 +1582,20 @@ class Hashtag:
yield next_other yield next_other
next_other = next(other_posts, None) next_other = next(other_posts, None)
def get_posts_resumable(self) -> NodeIterator[Post]:
    """Get the recent posts of the hashtag in a resumable fashion.

    The returned iterator is built on the GraphQL query ``9b498c08113f1e09617a1703c22b2f32`` with
    the hashtag name as its ``tag_name`` variable and the hashtag's explore page as referer.

    :rtype: NodeIterator[Post]

    .. versionadded:: 4.9"""
    def extract_edges(response):
        # Locate the media connection inside the query response.
        return response['data']['hashtag']['edge_hashtag_to_media']

    def wrap_node(node):
        return Post(self._context, node)

    context = self._context
    query_variables = {'tag_name': self.name}
    referer = f"https://www.instagram.com/explore/tags/{self.name}/"
    return NodeIterator(
        context,
        "9b498c08113f1e09617a1703c22b2f32",
        extract_edges,
        wrap_node,
        query_variables,
        referer,
    )
class TopSearchResults: class TopSearchResults:
""" """