1717# You should have received a copy of the GNU Lesser Public License
1818# along with this program. If not, see [http://www.gnu.org/licenses/].
1919"""This module contains helper functions."""
20- from html import escape
21-
2220import re
23- from collections import OrderedDict
2421import signal
22+ from collections import OrderedDict
2523from datetime import datetime
2624
27- try :
28- from urllib .parse import urlparse
29- except ImportError :
30- from urlparse import urlparse
31-
3225try :
3326 from html import escape as escape_html # noqa: F401
3427except ImportError :
3528 from cgi import escape as escape_html # noqa: F401
3629
37-
3830# From https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python
3931_signames = {v : k
4032 for k , v in reversed (sorted (vars (signal ).items ()))
@@ -56,7 +48,6 @@ def _timestamp(dt_obj):
5648 # Python < 3.3 (incl 2.7)
5749 from time import mktime
5850
59-
6051 def _timestamp (dt_obj ):
6152 return mktime (dt_obj .timetuple ())
6253
@@ -107,7 +98,7 @@ def mention_html(user_id, name):
10798 :obj:`str`: The inline mention for the user as html.
10899 """
109100 if isinstance (user_id , int ):
110- return '<a href="tg://user?id={}">{}</a>' .format (user_id , escape (name ))
101+ return '<a href="tg://user?id={}">{}</a>' .format (user_id , escape_html (name ))
111102
112103
113104def mention_markdown (user_id , name ):
@@ -154,23 +145,11 @@ def effective_message_type(entity):
154145 return None
155146
156147
157- def _extract_urls_from_text (text ):
158- """
159- Returns a list of urls from a text string.
160- URLs without a leading `http://` or `www.` won't be found.
161- """
162- out = []
163- for word in text .split (' ' ):
164- thing = urlparse (word .strip ())
165- if thing .scheme :
166- out .append (word )
167- return out
168-
169-
170148def extract_urls (message ):
171149 """
172150 Extracts all Hyperlinks that are contained in a message. This includes
173- message entities and the media caption. Distinct links are returned in order of appearance.
151+ message entities and the media caption. Distinct links are returned in order of appearance,
152+ while links in the text take precedence over ones in the media caption.
174153
175154 Note: Exact duplicates are removed, but there may still be URLs that link
176155 to the same resource.
@@ -186,11 +165,12 @@ def extract_urls(message):
186165 types = [MessageEntity .URL , MessageEntity .TEXT_LINK ]
187166 results = message .parse_entities (types = types )
188167 results .update (message .parse_caption_entities (types = types ))
189- all_urls = [v if k .type == MessageEntity .URL else k .url for k , v in results .items ()]
168+
169+ all_urls = (v if k .type == MessageEntity .URL else k .url for k , v in results .items ())
190170
191171 # Strip trailing slash from URL so we can compare them for equality
192- stripped_urls = [ x [: - 1 ] if x [ - 1 ] == '/' else x for x in all_urls ]
172+ stripped_urls = ( x . rstrip ( '/' ) for x in all_urls )
193173
194- # Remove exact duplicates, compliant with legacy python
174+ # Remove exact duplicates, in a way that is compliant with legacy python
195175 urls = OrderedDict ({k : None for k in stripped_urls })
196176 return list (urls .keys ())
0 commit comments