Name profilepic by hash if Last-Modified missing
This fixes #188. Also, the profile pic URL is now requested only once: the Last-Modified header is read from the download response itself instead of from a separate HEAD request made beforehand.
This commit is contained in:
parent
06845b53fc
commit
4bc0a94e12
@ -11,6 +11,7 @@ import tempfile
|
||||
from contextlib import contextmanager, suppress
|
||||
from datetime import datetime, timezone
|
||||
from functools import wraps
|
||||
from hashlib import md5
|
||||
from io import BytesIO
|
||||
from typing import Any, Callable, Iterator, List, Optional, Set, Union
|
||||
|
||||
@ -289,26 +290,32 @@ class Instaloader:
|
||||
"""Downloads and saves profile pic."""
|
||||
|
||||
def _epoch_to_string(epoch: datetime) -> str:
|
||||
return epoch.strftime('%Y-%m-%d_%H-%M-%S')
|
||||
return epoch.strftime('%Y-%m-%d_%H-%M-%S_UTC')
|
||||
|
||||
profile_pic_url = profile.profile_pic_url
|
||||
with self.context.get_anonymous_session() as anonymous_session:
|
||||
date_object = datetime.strptime(anonymous_session.head(profile_pic_url).headers["Last-Modified"],
|
||||
'%a, %d %b %Y %H:%M:%S GMT')
|
||||
profile_pic_response = self.context.get_raw(profile.profile_pic_url)
|
||||
if 'Last-Modified' in profile_pic_response.headers:
|
||||
date_object = datetime.strptime(profile_pic_response.headers["Last-Modified"], '%a, %d %b %Y %H:%M:%S GMT')
|
||||
profile_pic_bytes = None
|
||||
profile_pic_identifier = _epoch_to_string(date_object)
|
||||
else:
|
||||
date_object = None
|
||||
profile_pic_bytes = profile_pic_response.content
|
||||
profile_pic_identifier = md5(profile_pic_bytes).hexdigest()[:16]
|
||||
profile_pic_extension = 'jpg'
|
||||
if ((format_string_contains_key(self.dirname_pattern, 'profile') or
|
||||
format_string_contains_key(self.dirname_pattern, 'target'))):
|
||||
filename = '{0}/{1}_UTC_profile_pic.{2}'.format(self.dirname_pattern.format(profile=profile.username.lower(),
|
||||
target=profile.username.lower()),
|
||||
_epoch_to_string(date_object), profile_pic_extension)
|
||||
filename = '{0}/{1}_profile_pic.{2}'.format(self.dirname_pattern.format(profile=profile.username.lower(),
|
||||
target=profile.username.lower()),
|
||||
profile_pic_identifier, profile_pic_extension)
|
||||
else:
|
||||
filename = '{0}/{1}_{2}_UTC_profile_pic.{3}'.format(self.dirname_pattern.format(), profile.username.lower(),
|
||||
_epoch_to_string(date_object), profile_pic_extension)
|
||||
filename = '{0}/{1}_{2}_profile_pic.{3}'.format(self.dirname_pattern.format(), profile.username.lower(),
|
||||
profile_pic_identifier, profile_pic_extension)
|
||||
if os.path.isfile(filename):
|
||||
self.context.log(filename + ' already exists')
|
||||
return None
|
||||
self.context.get_and_write_raw(profile_pic_url, filename)
|
||||
os.utime(filename, (datetime.now().timestamp(), date_object.timestamp()))
|
||||
self.context.write_raw(profile_pic_bytes if profile_pic_bytes else profile_pic_response, filename)
|
||||
if date_object:
|
||||
os.utime(filename, (datetime.now().timestamp(), date_object.timestamp()))
|
||||
self.context.log('') # log output of _get_and_write_raw() does not produce \n
|
||||
|
||||
@_requires_login
|
||||
|
@ -10,7 +10,7 @@ import time
|
||||
import urllib.parse
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Callable, Dict, Iterator, Optional
|
||||
from typing import Any, Callable, Dict, Iterator, Optional, Union
|
||||
|
||||
import requests
|
||||
import requests.utils
|
||||
@ -429,8 +429,17 @@ class InstaloaderContext:
|
||||
data = _query()
|
||||
yield from (edge['node'] for edge in data['edges'])
|
||||
|
||||
def get_and_write_raw(self, url: str, filename: str, _attempt=1) -> None:
|
||||
"""Downloads raw data.
|
||||
def write_raw(self, resp: Union[bytes, requests.Response], filename: str) -> None:
    """Write raw response data into a file.

    The data is written to ``filename + '.temp'`` first and only moved to
    ``filename`` once it has been written completely, so an interrupted or
    failed write never leaves a truncated file under the final name.

    :param resp: Either the bytes to store, or a :class:`requests.Response`
       whose ``raw`` stream is copied into the file.
    :param filename: Path of the file to create or overwrite."""
    # Imported locally because this module's top-level imports are not fully visible here.
    import os

    self.log(filename, end=' ', flush=True)
    temp_filename = filename + '.temp'
    with open(temp_filename, 'wb') as file:
        if isinstance(resp, requests.Response):
            # Stream the body straight from the underlying connection.
            shutil.copyfileobj(resp.raw, file)
        else:
            file.write(resp)
    # Move into place only after a complete write; code that checks for an
    # existing file would otherwise mistake a partial download for a finished one.
    os.replace(temp_filename, filename)
|
||||
|
||||
def get_raw(self, url: str, _attempt=1) -> requests.Response:
|
||||
"""Downloads a file anonymously.
|
||||
|
||||
:raises QueryReturnedNotFoundException: When the server responds with a 404.
|
||||
:raises QueryReturnedForbiddenException: When the server responds with a 403.
|
||||
@ -439,10 +448,8 @@ class InstaloaderContext:
|
||||
with self.get_anonymous_session() as anonymous_session:
|
||||
resp = anonymous_session.get(url, stream=True)
|
||||
if resp.status_code == 200:
|
||||
self.log(filename, end=' ', flush=True)
|
||||
with open(filename, 'wb') as file:
|
||||
resp.raw.decode_content = True
|
||||
shutil.copyfileobj(resp.raw, file)
|
||||
resp.raw.decode_content = True
|
||||
return resp
|
||||
else:
|
||||
if resp.status_code == 403:
|
||||
# suspected invalid URL signature
|
||||
@ -458,11 +465,19 @@ class InstaloaderContext:
|
||||
self.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False)
|
||||
try:
|
||||
self._sleep()
|
||||
self.get_and_write_raw(url, filename, _attempt + 1)
|
||||
return self.get_raw(url, _attempt + 1)
|
||||
except KeyboardInterrupt:
|
||||
self.error("[skipped by user]", repeat_at_end=False)
|
||||
raise ConnectionException(error_string) from err
|
||||
|
||||
def get_and_write_raw(self, url: str, filename: str) -> None:
    """Fetch ``url`` anonymously and store the raw data in ``filename``.

    :raises QueryReturnedNotFoundException: When the server responds with a 404.
    :raises QueryReturnedForbiddenException: When the server responds with a 403.
    :raises ConnectionException: When download repeatedly failed."""
    response = self.get_raw(url)
    self.write_raw(response, filename)
|
||||
|
||||
@property
|
||||
def root_rhx_gis(self) -> Optional[str]:
|
||||
"""rhx_gis string returned in the / query."""
|
||||
|
Loading…
Reference in New Issue
Block a user