Primer commit del proyecto RSS

2025-05-24 14:37:58 +02:00 · 2025-05-24 14:37:58 +02:00 · 27c9515d29
commit 27c9515d29
1568 changed files with 252311 additions and 0 deletions
--- a/venv/lib/python3.12/site-packages/feedparser/api.py
+++ b/venv/lib/python3.12/site-packages/feedparser/api.py
@ -0,0 +1,277 @@
+# The public API for feedparser
+# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2002-2008 Mark Pilgrim
+# All rights reserved.
+#
+# This file is a part of feedparser.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import io
+import urllib.error
+import urllib.parse
+import xml.sax
+
+from .datetimes import registerDateHandler, _parse_date
+from .encodings import convert_to_utf8
+from .exceptions import *
+from .html import _BaseHTMLProcessor
+from . import http
+from . import mixin
+from .mixin import _FeedParserMixin
+from .parsers.loose import _LooseFeedParser
+from .parsers.strict import _StrictFeedParser
+from .sanitizer import replace_doctype
+from .sgml import *
+from .urls import convert_to_idn, make_safe_absolute_uri
+from .util import FeedParserDict
+
+
+# List of preferred XML parsers, by SAX driver name.  These will be tried first,
+# but if they're not installed, Python will keep searching through its own list
+# of pre-installed parsers until it finds one that supports everything we need.
+PREFERRED_XML_PARSERS = ["drv_libxml2"]
+
+_XML_AVAILABLE = True
+
+SUPPORTED_VERSIONS = {
+    '': 'unknown',
+    'rss090': 'RSS 0.90',
+    'rss091n': 'RSS 0.91 (Netscape)',
+    'rss091u': 'RSS 0.91 (Userland)',
+    'rss092': 'RSS 0.92',
+    'rss093': 'RSS 0.93',
+    'rss094': 'RSS 0.94',
+    'rss20': 'RSS 2.0',
+    'rss10': 'RSS 1.0',
+    'rss': 'RSS (unknown version)',
+    'atom01': 'Atom 0.1',
+    'atom02': 'Atom 0.2',
+    'atom03': 'Atom 0.3',
+    'atom10': 'Atom 1.0',
+    'atom': 'Atom (unknown version)',
+    'cdf': 'CDF',
+}
+
+
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result):
+    """URL, filename, or string --> stream
+
+    This function lets you define parsers that take any input source
+    (URL, pathname to local or network file, or actual data as a string)
+    and deal with it in a uniform manner.  Returned object is guaranteed
+    to have all the basic stdio read methods (read, readline, readlines).
+    Just .close() the object when you're done with it.
+
+    If the etag argument is supplied, it will be used as the value of an
+    If-None-Match request header.
+
+    If the modified argument is supplied, it can be a tuple of 9 integers
+    (as returned by gmtime() in the standard Python time module) or a date
+    string in any format supported by feedparser. Regardless, it MUST
+    be in GMT (Greenwich Mean Time). It will be reformatted into an
+    RFC 1123-compliant date and used as the value of an If-Modified-Since
+    request header.
+
+    If the agent argument is supplied, it will be used as the value of a
+    User-Agent request header.
+
+    If the referrer argument is supplied, it will be used as the value of a
+    Referer[sic] request header.
+
+    If handlers is supplied, it is a list of handlers used to build a
+    urllib2 opener.
+
+    if request_headers is supplied it is a dictionary of HTTP request headers
+    that will override the values generated by FeedParser.
+
+    :return: A bytes object.
+    """
+
+    if hasattr(url_file_stream_or_string, 'read'):
+        return url_file_stream_or_string.read()
+
+    if isinstance(url_file_stream_or_string, str) \
+       and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
+        return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
+
+    # try to open with native open function (if url_file_stream_or_string is a filename)
+    try:
+        with open(url_file_stream_or_string, 'rb') as f:
+            data = f.read()
+    except (IOError, UnicodeEncodeError, TypeError, ValueError):
+        # if url_file_stream_or_string is a str object that
+        # cannot be converted to the encoding returned by
+        # sys.getfilesystemencoding(), a UnicodeEncodeError
+        # will be thrown
+        # If url_file_stream_or_string is a string that contains NULL
+        # (such as an XML document encoded in UTF-32), TypeError will
+        # be thrown.
+        pass
+    else:
+        return data
+
+    # treat url_file_stream_or_string as string
+    if not isinstance(url_file_stream_or_string, bytes):
+        return url_file_stream_or_string.encode('utf-8')
+    return url_file_stream_or_string
+
+
+LooseFeedParser = type(
+    'LooseFeedParser',
+    (_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor, object),
+    {},
+)
+
+StrictFeedParser = type(
+    'StrictFeedParser',
+    (_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object),
+    {},
+)
+
+
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
+    """Parse a feed from a URL, file, stream, or string.
+
+    :param url_file_stream_or_string:
+        File-like object, URL, file path, or string. Both byte and text strings
+        are accepted. If necessary, encoding will be derived from the response
+        headers or automatically detected.
+
+        Note that strings may trigger network I/O or filesystem access
+        depending on the value. Wrap an untrusted string in
+        a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
+        pass untrusted strings to this function.
+
+        When a URL is not passed the feed location to use in relative URL
+        resolution should be passed in the ``Content-Location`` response header
+        (see ``response_headers`` below).
+
+    :param str etag: HTTP ``ETag`` request header.
+    :param modified: HTTP ``Last-Modified`` request header.
+    :type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
+        :class:`datetime.datetime`
+    :param str agent: HTTP ``User-Agent`` request header, which defaults to
+        the value of :data:`feedparser.USER_AGENT`.
+    :param referrer: HTTP ``Referer`` [sic] request header.
+    :param request_headers:
+        A mapping of HTTP header name to HTTP header value to add to the
+        request, overriding internally generated values.
+    :type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
+    :param response_headers:
+        A mapping of HTTP header name to HTTP header value. Multiple values may
+        be joined with a comma. If a HTTP request was made, these headers
+        override any matching headers in the response. Otherwise this specifies
+        the entirety of the response headers.
+    :type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
+
+    :param bool resolve_relative_uris:
+        Should feedparser attempt to resolve relative URIs absolute ones within
+        HTML content?  Defaults to the value of
+        :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
+    :param bool sanitize_html:
+        Should feedparser skip HTML sanitization? Only disable this if you know
+        what you are doing!  Defaults to the value of
+        :data:`feedparser.SANITIZE_HTML`, which is ``True``.
+
+    :return: A :class:`FeedParserDict`.
+    """
+
+    if not agent or sanitize_html is None or resolve_relative_uris is None:
+        import feedparser
+    if not agent:
+        agent = feedparser.USER_AGENT
+    if sanitize_html is None:
+        sanitize_html = feedparser.SANITIZE_HTML
+    if resolve_relative_uris is None:
+        resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
+
+    result = FeedParserDict(
+        bozo=False,
+        entries=[],
+        feed=FeedParserDict(),
+        headers={},
+    )
+
+    try:
+        data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
+    except urllib.error.URLError as error:
+        result.update({
+            'bozo': True,
+            'bozo_exception': error,
+        })
+        return result
+
+    if not data:
+        return result
+
+    # overwrite existing headers using response_headers
+    result['headers'].update(response_headers or {})
+
+    data = convert_to_utf8(result['headers'], data, result)
+    use_strict_parser = result['encoding'] and True or False
+
+    result['version'], data, entities = replace_doctype(data)
+
+    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
+    contentloc = result['headers'].get('content-location', '')
+    href = result.get('href', '')
+    baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href
+
+    baselang = result['headers'].get('content-language', None)
+    if isinstance(baselang, bytes) and baselang is not None:
+        baselang = baselang.decode('utf-8', 'ignore')
+
+    if not _XML_AVAILABLE:
+        use_strict_parser = 0
+    if use_strict_parser:
+        # initialize the SAX parser
+        feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
+        feedparser.resolve_relative_uris = resolve_relative_uris
+        feedparser.sanitize_html = sanitize_html
+        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
+        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
+        try:
+            # disable downloading external doctype references, if possible
+            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
+        except xml.sax.SAXNotSupportedException:
+            pass
+        saxparser.setContentHandler(feedparser)
+        saxparser.setErrorHandler(feedparser)
+        source = xml.sax.xmlreader.InputSource()
+        source.setByteStream(io.BytesIO(data))
+        try:
+            saxparser.parse(source)
+        except xml.sax.SAXException as e:
+            result['bozo'] = 1
+            result['bozo_exception'] = feedparser.exc or e
+            use_strict_parser = 0
+    if not use_strict_parser:
+        feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
+        feedparser.resolve_relative_uris = resolve_relative_uris
+        feedparser.sanitize_html = sanitize_html
+        feedparser.feed(data.decode('utf-8', 'replace'))
+    result['feed'] = feedparser.feeddata
+    result['entries'] = feedparser.entries
+    result['version'] = result['version'] or feedparser.version
+    result['namespaces'] = feedparser.namespaces_in_use
+    return result