parent
74d5e35eb8
commit
73ec884ea4
@ -3,6 +3,7 @@
|
|||||||
"""Download pictures (or videos) along with their captions and other metadata from Instagram."""
|
"""Download pictures (or videos) along with their captions and other metadata from Instagram."""
|
||||||
import ast
|
import ast
|
||||||
import getpass
|
import getpass
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
@ -201,6 +202,7 @@ class Post:
|
|||||||
self._profile = profile
|
self._profile = profile
|
||||||
self._profile_id = profile_id
|
self._profile_id = profile_id
|
||||||
self._full_metadata_dict = None
|
self._full_metadata_dict = None
|
||||||
|
self._rhx_gis = None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_shortcode(cls, instaloader: 'Instaloader', shortcode: str):
|
def from_shortcode(cls, instaloader: 'Instaloader', shortcode: str):
|
||||||
@ -239,11 +241,9 @@ class Post:
|
|||||||
@property
|
@property
|
||||||
def _full_metadata(self) -> Dict[str, Any]:
|
def _full_metadata(self) -> Dict[str, Any]:
|
||||||
if not self._full_metadata_dict:
|
if not self._full_metadata_dict:
|
||||||
pic_json = self._instaloader.get_json("p/{0}/".format(self.shortcode), params={'__a': 1})
|
pic_json = self._instaloader.get_json("p/{0}/".format(self.shortcode), params={})
|
||||||
if "graphql" in pic_json:
|
self._full_metadata_dict = pic_json['entry_data']['PostPage'][0]['graphql']['shortcode_media']
|
||||||
self._full_metadata_dict = pic_json["graphql"]["shortcode_media"]
|
self._rhx_gis = pic_json['rhx_gis']
|
||||||
else:
|
|
||||||
self._full_metadata_dict = pic_json["media"]
|
|
||||||
return self._full_metadata_dict
|
return self._full_metadata_dict
|
||||||
|
|
||||||
def _field(self, *keys) -> Any:
|
def _field(self, *keys) -> Any:
|
||||||
@ -387,7 +387,8 @@ class Post:
|
|||||||
return
|
return
|
||||||
yield from self._instaloader.graphql_node_list(17852405266163336, {'shortcode': self.shortcode},
|
yield from self._instaloader.graphql_node_list(17852405266163336, {'shortcode': self.shortcode},
|
||||||
'https://www.instagram.com/p/' + self.shortcode + '/',
|
'https://www.instagram.com/p/' + self.shortcode + '/',
|
||||||
lambda d: d['data']['shortcode_media']['edge_media_to_comment'])
|
lambda d: d['data']['shortcode_media']['edge_media_to_comment'],
|
||||||
|
rhx_gis=self._rhx_gis)
|
||||||
|
|
||||||
def get_likes(self) -> Iterator[Dict[str, Any]]:
|
def get_likes(self) -> Iterator[Dict[str, Any]]:
|
||||||
"""Iterate over all likes of the post.
|
"""Iterate over all likes of the post.
|
||||||
@ -405,7 +406,8 @@ class Post:
|
|||||||
return
|
return
|
||||||
yield from self._instaloader.graphql_node_list("1cb6ec562846122743b61e492c85999f", {'shortcode': self.shortcode},
|
yield from self._instaloader.graphql_node_list("1cb6ec562846122743b61e492c85999f", {'shortcode': self.shortcode},
|
||||||
'https://www.instagram.com/p/' + self.shortcode + '/',
|
'https://www.instagram.com/p/' + self.shortcode + '/',
|
||||||
lambda d: d['data']['shortcode_media']['edge_liked_by'])
|
lambda d: d['data']['shortcode_media']['edge_liked_by'],
|
||||||
|
rhx_gis=self._rhx_gis)
|
||||||
|
|
||||||
def get_location(self) -> Optional[Dict[str, str]]:
|
def get_location(self) -> Optional[Dict[str, str]]:
|
||||||
"""If the Post has a location, returns a dictionary with fields 'lat' and 'lng'."""
|
"""If the Post has a location, returns a dictionary with fields 'lat' and 'lng'."""
|
||||||
@ -602,7 +604,7 @@ class Instaloader:
|
|||||||
:raises QueryReturnedNotFoundException: When the server responds with a 404.
|
:raises QueryReturnedNotFoundException: When the server responds with a 404.
|
||||||
:raises ConnectionException: When query repeatedly failed.
|
:raises ConnectionException: When query repeatedly failed.
|
||||||
"""
|
"""
|
||||||
def graphql_query_waittime(query_id: int, untracked_queries: bool = False) -> int:
|
def graphql_query_waittime(query_id: Union[int, str], untracked_queries: bool = False) -> int:
|
||||||
sliding_window = 660
|
sliding_window = 660
|
||||||
timestamps = self.previous_queries.get(query_id)
|
timestamps = self.previous_queries.get(query_id)
|
||||||
if not timestamps:
|
if not timestamps:
|
||||||
@ -613,9 +615,9 @@ class Instaloader:
|
|||||||
if len(timestamps) < 100 and not untracked_queries:
|
if len(timestamps) < 100 and not untracked_queries:
|
||||||
return 0
|
return 0
|
||||||
return round(min(timestamps) + sliding_window - current_time) + 6
|
return round(min(timestamps) + sliding_window - current_time) + 6
|
||||||
is_graphql_query = 'query_id' in params and 'graphql/query' in path
|
is_graphql_query = 'graphql/query' in path
|
||||||
if is_graphql_query:
|
if is_graphql_query:
|
||||||
query_id = params['query_id']
|
query_id = params['query_id'] if 'query_id' in params else params['query_hash']
|
||||||
waittime = graphql_query_waittime(query_id)
|
waittime = graphql_query_waittime(query_id)
|
||||||
if waittime > 0:
|
if waittime > 0:
|
||||||
self._log('\nToo many queries in the last time. Need to wait {} seconds.'.format(waittime))
|
self._log('\nToo many queries in the last time. Need to wait {} seconds.'.format(waittime))
|
||||||
@ -635,7 +637,13 @@ class Instaloader:
|
|||||||
raise TooManyRequests("429 - Too Many Requests")
|
raise TooManyRequests("429 - Too Many Requests")
|
||||||
if resp.status_code != 200:
|
if resp.status_code != 200:
|
||||||
raise ConnectionException("HTTP error code {}.".format(resp.status_code))
|
raise ConnectionException("HTTP error code {}.".format(resp.status_code))
|
||||||
resp_json = resp.json()
|
if not is_graphql_query and not "__a" in params and host == "www.instagram.com":
|
||||||
|
match = re.search(r'window\._sharedData = (.*);</script>', resp.text)
|
||||||
|
if match is None:
|
||||||
|
raise ConnectionException("Could not find \"window._sharedData\" in html response.")
|
||||||
|
return json.loads(match.group(1))
|
||||||
|
else:
|
||||||
|
resp_json = resp.json()
|
||||||
if 'status' in resp_json and resp_json['status'] != "ok":
|
if 'status' in resp_json and resp_json['status'] != "ok":
|
||||||
if 'message' in resp_json:
|
if 'message' in resp_json:
|
||||||
raise ConnectionException("Returned \"{}\" status, message \"{}\".".format(resp_json['status'],
|
raise ConnectionException("Returned \"{}\" status, message \"{}\".".format(resp_json['status'],
|
||||||
@ -695,7 +703,7 @@ class Instaloader:
|
|||||||
return session
|
return session
|
||||||
|
|
||||||
def graphql_query(self, query_identifier: Union[int, str], variables: Dict[str, Any],
|
def graphql_query(self, query_identifier: Union[int, str], variables: Dict[str, Any],
|
||||||
referer: Optional[str] = None) -> Dict[str, Any]:
|
referer: Optional[str] = None, rhx_gis: Optional[str] = None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Do a GraphQL Query.
|
Do a GraphQL Query.
|
||||||
|
|
||||||
@ -713,9 +721,18 @@ class Instaloader:
|
|||||||
tmpsession.headers['accept'] = '*/*'
|
tmpsession.headers['accept'] = '*/*'
|
||||||
if referer is not None:
|
if referer is not None:
|
||||||
tmpsession.headers['referer'] = urllib.parse.quote(referer)
|
tmpsession.headers['referer'] = urllib.parse.quote(referer)
|
||||||
|
|
||||||
|
variables_json = json.dumps(variables, separators=(',', ':'))
|
||||||
|
|
||||||
|
if rhx_gis:
|
||||||
|
values = "{}:{}:{}:{}".format(rhx_gis, tmpsession.cookies['csrftoken'], self.user_agent, variables_json)
|
||||||
|
x_instagram_gis = hashlib.md5(values.encode()).hexdigest()
|
||||||
|
tmpsession.cookies.set('ig_pr', '2')
|
||||||
|
tmpsession.headers['x-instagram-gis'] = x_instagram_gis
|
||||||
|
|
||||||
resp_json = self.get_json('graphql/query',
|
resp_json = self.get_json('graphql/query',
|
||||||
params={'query_id' if isinstance(query_identifier, int) else 'query_hash': query_identifier,
|
params={'query_id' if isinstance(query_identifier, int) else 'query_hash': query_identifier,
|
||||||
'variables': json.dumps(variables, separators=(',', ':'))},
|
'variables': variables_json},
|
||||||
session=tmpsession)
|
session=tmpsession)
|
||||||
if 'status' not in resp_json:
|
if 'status' not in resp_json:
|
||||||
self.error("GraphQL response did not contain a \"status\" field.")
|
self.error("GraphQL response did not contain a \"status\" field.")
|
||||||
@ -740,20 +757,21 @@ class Instaloader:
|
|||||||
def get_id_by_username(self, profile: str) -> int:
|
def get_id_by_username(self, profile: str) -> int:
|
||||||
"""Each Instagram profile has its own unique ID which stays unmodified even if a user changes
|
"""Each Instagram profile has its own unique ID which stays unmodified even if a user changes
|
||||||
his/her username. To get said ID, given the profile's name, you may call this function."""
|
his/her username. To get said ID, given the profile's name, you may call this function."""
|
||||||
return int(self.get_profile_metadata(profile)['user']['id'])
|
return int(self.get_profile_metadata(profile)[0]['user']['id'])
|
||||||
|
|
||||||
def graphql_node_list(self, query_identifier: Union[int, str], query_variables: Dict[str, Any],
|
def graphql_node_list(self, query_identifier: Union[int, str], query_variables: Dict[str, Any],
|
||||||
query_referer: Optional[str],
|
query_referer: Optional[str],
|
||||||
edge_extractor: Callable[[Dict[str, Any]], Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
|
edge_extractor: Callable[[Dict[str, Any]], Dict[str, Any]],
|
||||||
|
rhx_gis: Optional[str] = None) -> Iterator[Dict[str, Any]]:
|
||||||
"""Retrieve a list of GraphQL nodes."""
|
"""Retrieve a list of GraphQL nodes."""
|
||||||
query_variables['first'] = Instaloader.GRAPHQL_PAGE_LENGTH
|
query_variables['first'] = Instaloader.GRAPHQL_PAGE_LENGTH
|
||||||
data = self.graphql_query(query_identifier, query_variables, query_referer)
|
data = self.graphql_query(query_identifier, query_variables, query_referer, rhx_gis)
|
||||||
while True:
|
while True:
|
||||||
edge_struct = edge_extractor(data)
|
edge_struct = edge_extractor(data)
|
||||||
yield from [edge['node'] for edge in edge_struct['edges']]
|
yield from [edge['node'] for edge in edge_struct['edges']]
|
||||||
if edge_struct['page_info']['has_next_page']:
|
if edge_struct['page_info']['has_next_page']:
|
||||||
query_variables['after'] = edge_struct['page_info']['end_cursor']
|
query_variables['after'] = edge_struct['page_info']['end_cursor']
|
||||||
data = self.graphql_query(query_identifier, query_variables, query_referer)
|
data = self.graphql_query(query_identifier, query_variables, query_referer, rhx_gis)
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -1257,7 +1275,7 @@ class Instaloader:
|
|||||||
|
|
||||||
if not self.is_logged_in:
|
if not self.is_logged_in:
|
||||||
return
|
return
|
||||||
data = self.get_profile_metadata(self.username)
|
data, full_metadata = self.get_profile_metadata(self.username)
|
||||||
user_id = data["user"]["id"]
|
user_id = data["user"]["id"]
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
@ -1277,7 +1295,8 @@ class Instaloader:
|
|||||||
break
|
break
|
||||||
data = self.graphql_query("f883d95537fbcd400f466f63d42bd8a1",
|
data = self.graphql_query("f883d95537fbcd400f466f63d42bd8a1",
|
||||||
{'id': user_id, 'first': Instaloader.GRAPHQL_PAGE_LENGTH,
|
{'id': user_id, 'first': Instaloader.GRAPHQL_PAGE_LENGTH,
|
||||||
'after': saved_media["page_info"]["end_cursor"]})['data']
|
'after': saved_media["page_info"]["end_cursor"]},
|
||||||
|
rhx_gis=full_metadata['rhx_gis'])['data']
|
||||||
|
|
||||||
def download_saved_posts(self, max_count: int = None, fast_update: bool = False,
|
def download_saved_posts(self, max_count: int = None, fast_update: bool = False,
|
||||||
filter_func: Optional[Callable[[Post], bool]] = None) -> None:
|
filter_func: Optional[Callable[[Post], bool]] = None) -> None:
|
||||||
@ -1403,15 +1422,15 @@ class Instaloader:
|
|||||||
return profile, profile_id
|
return profile, profile_id
|
||||||
raise ProfileNotExistsException("Profile {0} does not exist.".format(profile))
|
raise ProfileNotExistsException("Profile {0} does not exist.".format(profile))
|
||||||
|
|
||||||
def get_profile_metadata(self, profile_name: str) -> Dict[str, Any]:
|
def get_profile_metadata(self, profile_name: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
||||||
"""Retrieves a profile's metadata, for use with e.g. :meth:`get_profile_posts` and :meth:`check_profile_id`."""
|
"""Retrieves a profile's metadata, for use with e.g. :meth:`get_profile_posts` and :meth:`check_profile_id`."""
|
||||||
try:
|
try:
|
||||||
metadata = self.get_json('{}/'.format(profile_name), params={'__a': 1})
|
metadata = self.get_json('{}/'.format(profile_name), params={})
|
||||||
return metadata['graphql'] if 'graphql' in metadata else metadata
|
return metadata['entry_data']['ProfilePage'][0]['graphql'], metadata
|
||||||
except QueryReturnedNotFoundException:
|
except QueryReturnedNotFoundException:
|
||||||
raise ProfileNotExistsException('Profile {} does not exist.'.format(profile_name))
|
raise ProfileNotExistsException('Profile {} does not exist.'.format(profile_name))
|
||||||
|
|
||||||
def get_profile_posts(self, profile_metadata: Dict[str, Any]) -> Iterator[Post]:
|
def get_profile_posts(self, profile_metadata: Dict[str, Any], rhx_gis: str) -> Iterator[Post]:
|
||||||
"""Retrieve all posts from a profile."""
|
"""Retrieve all posts from a profile."""
|
||||||
profile_name = profile_metadata['user']['username']
|
profile_name = profile_metadata['user']['username']
|
||||||
profile_id = int(profile_metadata['user']['id'])
|
profile_id = int(profile_metadata['user']['id'])
|
||||||
@ -1432,7 +1451,7 @@ class Instaloader:
|
|||||||
data = self.graphql_query(17888483320059182, {'id': profile_metadata['user']['id'],
|
data = self.graphql_query(17888483320059182, {'id': profile_metadata['user']['id'],
|
||||||
'first': Instaloader.GRAPHQL_PAGE_LENGTH,
|
'first': Instaloader.GRAPHQL_PAGE_LENGTH,
|
||||||
'after': end_cursor},
|
'after': end_cursor},
|
||||||
'https://www.instagram.com/{0}/'.format(profile_name))
|
referer='https://www.instagram.com/{0}/'.format(profile_name), rhx_gis=rhx_gis)
|
||||||
media = data['data']['user']['edge_owner_to_timeline_media']
|
media = data['data']['user']['edge_owner_to_timeline_media']
|
||||||
yield from (Post(self, edge['node'], profile=profile_name, profile_id=profile_id)
|
yield from (Post(self, edge['node'], profile=profile_name, profile_id=profile_id)
|
||||||
for edge in media['edges'])
|
for edge in media['edges'])
|
||||||
@ -1452,14 +1471,14 @@ class Instaloader:
|
|||||||
with suppress(ProfileNotExistsException):
|
with suppress(ProfileNotExistsException):
|
||||||
# ProfileNotExistsException is raised again later in check_profile_id() when we search the profile, so we
|
# ProfileNotExistsException is raised again later in check_profile_id() when we search the profile, so we
|
||||||
# must suppress it here.
|
# must suppress it here.
|
||||||
profile_metadata = self.get_profile_metadata(name)
|
profile_metadata, full_metadata = self.get_profile_metadata(name)
|
||||||
|
|
||||||
# check if profile does exist or name has changed since last download
|
# check if profile does exist or name has changed since last download
|
||||||
# and update name and json data if necessary
|
# and update name and json data if necessary
|
||||||
name_updated, profile_id = self.check_profile_id(name, profile_metadata)
|
name_updated, profile_id = self.check_profile_id(name, profile_metadata)
|
||||||
if name_updated != name:
|
if name_updated != name:
|
||||||
name = name_updated
|
name = name_updated
|
||||||
profile_metadata = self.get_profile_metadata(name)
|
profile_metadata, full_metadata = self.get_profile_metadata(name)
|
||||||
|
|
||||||
# Download profile picture
|
# Download profile picture
|
||||||
if profile_pic or profile_pic_only:
|
if profile_pic or profile_pic_only:
|
||||||
@ -1494,7 +1513,7 @@ class Instaloader:
|
|||||||
else:
|
else:
|
||||||
totalcount = profile_metadata["user"]["edge_owner_to_timeline_media"]["count"]
|
totalcount = profile_metadata["user"]["edge_owner_to_timeline_media"]["count"]
|
||||||
count = 1
|
count = 1
|
||||||
for post in self.get_profile_posts(profile_metadata):
|
for post in self.get_profile_posts(profile_metadata, rhx_gis=full_metadata['rhx_gis']):
|
||||||
self._log("[%3i/%3i] " % (count, totalcount), end="", flush=True)
|
self._log("[%3i/%3i] " % (count, totalcount), end="", flush=True)
|
||||||
count += 1
|
count += 1
|
||||||
if filter_func is not None and not filter_func(post):
|
if filter_func is not None and not filter_func(post):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user