Primer commit del proyecto RSS

2025-05-24 14:37:58 +02:00 · 2025-05-24 14:37:58 +02:00 · 27c9515d29
commit 27c9515d29
1568 changed files with 252311 additions and 0 deletions
--- a/venv/lib/python3.12/site-packages/feedparser/urls.py
+++ b/venv/lib/python3.12/site-packages/feedparser/urls.py
@ -0,0 +1,155 @@
+# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2002-2008 Mark Pilgrim
+# All rights reserved.
+#
+# This file is a part of feedparser.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import re
+import urllib.parse
+
+from .html import _BaseHTMLProcessor
+
+# If you want feedparser to allow all URL schemes, set this to ()
+# List culled from Python's urlparse documentation at:
+#   http://docs.python.org/library/urlparse.html
+# as well as from "URI scheme" at Wikipedia:
+#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
+# Many more will likely need to be added!
+ACCEPTABLE_URI_SCHEMES = (
+    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
+    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
+    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
+    'wais',
+    # Additional common-but-unofficial schemes
+    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
+    'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
+)
+
+_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
+
+
+def _urljoin(base, uri):
+    uri = _urifixer.sub(r'\1\3', uri)
+    try:
+        uri = urllib.parse.urljoin(base, uri)
+    except ValueError:
+        uri = ''
+    return uri
+
+
+def convert_to_idn(url):
+    """Convert a URL to IDN notation"""
+    # this function should only be called with a unicode string
+    # strategy: if the host cannot be encoded in ascii, then
+    # it'll be necessary to encode it in idn form
+    parts = list(urllib.parse.urlsplit(url))
+    try:
+        parts[1].encode('ascii')
+    except UnicodeEncodeError:
+        # the url needs to be converted to idn notation
+        host = parts[1].rsplit(':', 1)
+        newhost = []
+        port = ''
+        if len(host) == 2:
+            port = host.pop()
+        for h in host[0].split('.'):
+            newhost.append(h.encode('idna').decode('utf-8'))
+        parts[1] = '.'.join(newhost)
+        if port:
+            parts[1] += ':' + port
+        return urllib.parse.urlunsplit(parts)
+    else:
+        return url
+
+
+def make_safe_absolute_uri(base, rel=None):
+    # bail if ACCEPTABLE_URI_SCHEMES is empty
+    if not ACCEPTABLE_URI_SCHEMES:
+        return _urljoin(base, rel or '')
+    if not base:
+        return rel or ''
+    if not rel:
+        try:
+            scheme = urllib.parse.urlparse(base)[0]
+        except ValueError:
+            return ''
+        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
+            return base
+        return ''
+    uri = _urljoin(base, rel)
+    if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
+        return ''
+    return uri
+
+
+class RelativeURIResolver(_BaseHTMLProcessor):
+    relative_uris = {
+        ('a', 'href'),
+        ('applet', 'codebase'),
+        ('area', 'href'),
+        ('audio', 'src'),
+        ('blockquote', 'cite'),
+        ('body', 'background'),
+        ('del', 'cite'),
+        ('form', 'action'),
+        ('frame', 'longdesc'),
+        ('frame', 'src'),
+        ('iframe', 'longdesc'),
+        ('iframe', 'src'),
+        ('head', 'profile'),
+        ('img', 'longdesc'),
+        ('img', 'src'),
+        ('img', 'usemap'),
+        ('input', 'src'),
+        ('input', 'usemap'),
+        ('ins', 'cite'),
+        ('link', 'href'),
+        ('object', 'classid'),
+        ('object', 'codebase'),
+        ('object', 'data'),
+        ('object', 'usemap'),
+        ('q', 'cite'),
+        ('script', 'src'),
+        ('source', 'src'),
+        ('video', 'poster'),
+        ('video', 'src'),
+    }
+
+    def __init__(self, baseuri, encoding, _type):
+        _BaseHTMLProcessor.__init__(self, encoding, _type)
+        self.baseuri = baseuri
+
+    def resolve_uri(self, uri):
+        return make_safe_absolute_uri(self.baseuri, uri.strip())
+
+    def unknown_starttag(self, tag, attrs):
+        attrs = self.normalize_attrs(attrs)
+        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolve_uri(value) or value) for key, value in attrs]
+        super(RelativeURIResolver, self).unknown_starttag(tag, attrs)
+
+
+def resolve_relative_uris(html_source, base_uri, encoding, type_):
+    p = RelativeURIResolver(base_uri, encoding, type_)
+    p.feed(html_source)
+    return p.output()