From 8784ac7d9b976c034df6d6ebd1fbb8c6e2825c0f Mon Sep 17 00:00:00 2001 From: MiguelX413 Date: Fri, 15 Jul 2022 12:14:16 -0700 Subject: [PATCH] Fix and globalize regex --- instaloader/structures.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/instaloader/structures.py b/instaloader/structures.py index b037cae..4852c21 100644 --- a/instaloader/structures.py +++ b/instaloader/structures.py @@ -73,6 +73,22 @@ PostLocation.has_public_page.__doc__ = "Whether location has a public page." PostLocation.lat.__doc__ = "Latitude (:class:`float` or None)." PostLocation.lng.__doc__ = "Longitude (:class:`float` or None)." +# This regular expression is by MiguelX413 +_hashtag_regex = re.compile(r"(?:#)((?:\w){1,150})") + +# This regular expression is modified from jStassen, adjusted to use Python's \w to +# support Unicode and a word/beginning of string delimiter at the beginning to ensure +# that no email addresses join the list of mentions. +# http://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/ +_mention_regex = re.compile(r"(?:^|[^\w\n]|_)(?:@)(\w(?:(?:\w|(?:\.(?!\.))){0,28}(?:\w))?)", re.ASCII) + + +def _optional_normalize(string: Optional[str]) -> Optional[str]: + if string is not None: + return normalize("NFC", string) + else: + return None + class Post: """ @@ -396,16 +412,10 @@ class Post: @property def caption(self) -> Optional[str]: """Caption.""" - def _normalize(string: Optional[str]) -> Optional[str]: - if string is not None: - return normalize("NFC", string) - else: - return None - if "edge_media_to_caption" in self._node and self._node["edge_media_to_caption"]["edges"]: - return _normalize(self._node["edge_media_to_caption"]["edges"][0]["node"]["text"]) + return _optional_normalize(self._node["edge_media_to_caption"]["edges"][0]["node"]["text"]) elif "caption" in self._node: - return _normalize(self._node["caption"]) + return _optional_normalize(self._node["caption"]) return None @property @@ -413,22 +423,14 @@ class Post: """List of all lowercased hashtags (without preceeding #) that occur in the Post's caption.""" if not self.caption: return [] - # This regular expression is from jStassen, adjusted to use Python's \w to support Unicode - # http://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/ - hashtag_regex = re.compile(r"(?:#)(\w(?:(?:\w|(?:\.(?!\.))){0,28}(?:\w))?)") - return re.findall(hashtag_regex, self.caption.lower()) + return _hashtag_regex.findall(self.caption.lower()) @property def caption_mentions(self) -> List[str]: """List of all lowercased profiles that are mentioned in the Post's caption, without preceeding @.""" if not self.caption: return [] - # This regular expression is modified from jStassen, adjusted to use Python's \w to - # support Unicode and a word/beginning of string delimiter at the beginning to ensure - # that no email addresses join the list of mentions. - # http://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/ - mention_regex = re.compile(r"(?:^|\W|_)(?:@)(\w(?:(?:\w|(?:\.(?!\.))){0,28}(?:\w))?)", re.ASCII) - return re.findall(mention_regex, self.caption.lower()) + return _mention_regex.findall(self.caption.lower()) @property def pcaption(self) -> str: