Primer commit del proyecto RSS
This commit is contained in:
commit
27c9515d29
1568 changed files with 252311 additions and 0 deletions
277
venv/lib/python3.12/site-packages/feedparser/api.py
Normal file
277
venv/lib/python3.12/site-packages/feedparser/api.py
Normal file
|
|
@ -0,0 +1,277 @@
|
|||
# The public API for feedparser
|
||||
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
# This file is a part of feedparser.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import io
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import xml.sax
|
||||
|
||||
from .datetimes import registerDateHandler, _parse_date
|
||||
from .encodings import convert_to_utf8
|
||||
from .exceptions import *
|
||||
from .html import _BaseHTMLProcessor
|
||||
from . import http
|
||||
from . import mixin
|
||||
from .mixin import _FeedParserMixin
|
||||
from .parsers.loose import _LooseFeedParser
|
||||
from .parsers.strict import _StrictFeedParser
|
||||
from .sanitizer import replace_doctype
|
||||
from .sgml import *
|
||||
from .urls import convert_to_idn, make_safe_absolute_uri
|
||||
from .util import FeedParserDict
|
||||
|
||||
|
||||
# Preferred XML parsers, identified by SAX driver name. These are tried first;
# if none of them is installed, Python keeps searching through its own list of
# pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# Module-level switch consulted by parse(): when false, the strict (SAX) parser
# is skipped and only the loose parser is used.
_XML_AVAILABLE = True
|
||||
|
||||
# Maps each feed version identifier to its human-readable name.
# The empty-string key is the entry for an unknown version.
SUPPORTED_VERSIONS = {
    '': 'unknown',
    'rss090': 'RSS 0.90',
    'rss091n': 'RSS 0.91 (Netscape)',
    'rss091u': 'RSS 0.91 (Userland)',
    'rss092': 'RSS 0.92',
    'rss093': 'RSS 0.93',
    'rss094': 'RSS 0.94',
    'rss20': 'RSS 2.0',
    'rss10': 'RSS 1.0',
    'rss': 'RSS (unknown version)',
    'atom01': 'Atom 0.1',
    'atom02': 'Atom 0.2',
    'atom03': 'Atom 0.3',
    'atom10': 'Atom 1.0',
    'atom': 'Atom (unknown version)',
    'cdf': 'CDF',
}
|
||||
|
||||
|
||||
def _open_resource(
    url_file_stream_or_string,
    etag,
    modified,
    agent,
    referrer,
    handlers,
    request_headers,
    result,
):
    """URL, filename, stream, or string --> bytes

    This function lets you define parsers that take any input source
    (file-like object, URL, pathname to a local or network file, or the
    actual data as a string) and deal with it in a uniform manner.

    The input is resolved in this order:

    1. An object with a ``read`` attribute is read in full.
    2. A ``str`` whose URL scheme is ``http``, ``https``, ``ftp``, ``file``
       or ``feed`` is fetched through ``http.get``.
    3. Anything else is tried as a filesystem path and read in binary mode.
    4. If it cannot be opened, the value itself is treated as the document.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it can be a tuple of 9 integers
    (as returned by gmtime() in the standard Python time module) or a date
    string in any format supported by feedparser. Regardless, it MUST
    be in GMT (Greenwich Mean Time). It will be reformatted into an
    RFC 1123-compliant date and used as the value of an If-Modified-Since
    request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib opener.

    If request_headers is supplied, it is a dictionary of HTTP request
    headers that will override the values generated by FeedParser.

    The result argument is forwarded unchanged to ``http.get``.

    :return: A bytes object.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        data = url_file_stream_or_string.read()
        # Text-mode streams (e.g. io.StringIO) return str. Encode it so the
        # documented bytes contract holds, using the same UTF-8 encoding
        # applied to plain-string input at the end of this function.
        if isinstance(data, str):
            return data.encode('utf-8')
        return data

    if isinstance(url_file_stream_or_string, str) \
       and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
        return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        with open(url_file_stream_or_string, 'rb') as f:
            data = f.read()
    except (IOError, UnicodeEncodeError, TypeError, ValueError):
        # Deliberate best-effort: failing to open simply means the value is
        # not a usable path, so fall through and treat it as the document.
        #
        # If url_file_stream_or_string is a str object that cannot be
        # converted to the encoding returned by sys.getfilesystemencoding(),
        # a UnicodeEncodeError will be thrown.
        # If url_file_stream_or_string is a string that contains NULL
        # (such as an XML document encoded in UTF-32), TypeError will
        # be thrown.
        pass
    else:
        return data

    # treat url_file_stream_or_string as string
    if not isinstance(url_file_stream_or_string, bytes):
        return url_file_stream_or_string.encode('utf-8')
    return url_file_stream_or_string
|
||||
|
||||
|
||||
class LooseFeedParser(_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor):
    """Concrete loose parser: ``_LooseFeedParser`` combined with the shared
    ``_FeedParserMixin`` and the ``_BaseHTMLProcessor`` base.

    ``parse()`` uses it when the strict SAX parser is not used or has failed,
    feeding it the decoded document text.
    """
|
||||
|
||||
class StrictFeedParser(_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler):
    """Concrete strict parser: ``_StrictFeedParser`` combined with the shared
    ``_FeedParserMixin`` and the SAX ``ContentHandler`` interface.

    ``parse()`` installs an instance as both the content handler and the
    error handler of the SAX parser.
    """
|
||||
|
||||
|
||||
def parse(
    url_file_stream_or_string,
    etag=None,
    modified=None,
    agent=None,
    referrer=None,
    handlers=None,
    request_headers=None,
    response_headers=None,
    resolve_relative_uris=None,
    sanitize_html=None,
):
    """Parse a feed from a URL, file, stream, or string.

    :param url_file_stream_or_string:
        File-like object, URL, file path, or string. Both byte and text strings
        are accepted. If necessary, encoding will be derived from the response
        headers or automatically detected.

        Note that strings may trigger network I/O or filesystem access
        depending on the value. Wrap an untrusted string in
        a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
        pass untrusted strings to this function.

        When a URL is not passed the feed location to use in relative URL
        resolution should be passed in the ``Content-Location`` response header
        (see ``response_headers`` below).

    :param str etag: Value for the HTTP ``If-None-Match`` request header
        (typically the ``ETag`` of a previous response).
    :param modified: Value for the HTTP ``If-Modified-Since`` request header
        (typically the ``Last-Modified`` of a previous response).
    :type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
        :class:`datetime.datetime`
    :param str agent: HTTP ``User-Agent`` request header, which defaults to
        the value of :data:`feedparser.USER_AGENT`.
    :param referrer: HTTP ``Referer`` [sic] request header.
    :param handlers:
        A list of urllib handlers used when fetching a URL; forwarded
        unchanged to ``_open_resource``.
    :param request_headers:
        A mapping of HTTP header name to HTTP header value to add to the
        request, overriding internally generated values.
    :type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
    :param response_headers:
        A mapping of HTTP header name to HTTP header value. Multiple values may
        be joined with a comma. If a HTTP request was made, these headers
        override any matching headers in the response. Otherwise this specifies
        the entirety of the response headers.
    :type response_headers: :class:`dict` mapping :class:`str` to :class:`str`

    :param bool resolve_relative_uris:
        Should feedparser attempt to resolve relative URIs to absolute ones
        within HTML content? Defaults to the value of
        :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
    :param bool sanitize_html:
        Should feedparser sanitize HTML content? Only disable this if you know
        what you are doing! Defaults to the value of
        :data:`feedparser.SANITIZE_HTML`, which is ``True``.

    :return: A :class:`FeedParserDict`. On a ``urllib.error.URLError`` while
        opening the resource, ``bozo`` is ``True`` and ``bozo_exception`` holds
        the error; if the resource yields no data, the empty result is returned.
    """

    if not agent or sanitize_html is None or resolve_relative_uris is None:
        # NOTE(review): imported at call time rather than module level,
        # presumably to avoid a circular import with the package -- confirm.
        import feedparser
        if not agent:
            agent = feedparser.USER_AGENT
        if sanitize_html is None:
            sanitize_html = feedparser.SANITIZE_HTML
        if resolve_relative_uris is None:
            resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS

    result = FeedParserDict(
        bozo=False,
        entries=[],
        feed=FeedParserDict(),
        headers={},
    )

    try:
        data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
    except urllib.error.URLError as error:
        result.update({
            'bozo': True,
            'bozo_exception': error,
        })
        return result

    if not data:
        return result

    # overwrite existing headers using response_headers
    result['headers'].update(response_headers or {})

    data = convert_to_utf8(result['headers'], data, result)
    # result['encoding'] is expected to have been populated by the calls above;
    # the strict parser is only attempted when it is truthy.
    use_strict_parser = bool(result['encoding'])

    result['version'], data, entities = replace_doctype(data)

    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
    contentloc = result['headers'].get('content-location', '')
    href = result.get('href', '')
    baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href

    baselang = result['headers'].get('content-language', None)
    if isinstance(baselang, bytes):
        baselang = baselang.decode('utf-8', 'ignore')

    if not _XML_AVAILABLE:
        use_strict_parser = False

    if use_strict_parser:
        # initialize the SAX parser
        feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
        feed_parser.resolve_relative_uris = resolve_relative_uris
        feed_parser.sanitize_html = sanitize_html
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # disable downloading external doctype references, if possible
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feed_parser)
        saxparser.setErrorHandler(feed_parser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(io.BytesIO(data))
        try:
            saxparser.parse(source)
        except xml.sax.SAXException as e:
            # Record the failure and fall back to the loose parser below.
            result['bozo'] = True
            result['bozo_exception'] = feed_parser.exc or e
            use_strict_parser = False

    if not use_strict_parser:
        feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feed_parser.resolve_relative_uris = resolve_relative_uris
        feed_parser.sanitize_html = sanitize_html
        feed_parser.feed(data.decode('utf-8', 'replace'))

    result['feed'] = feed_parser.feeddata
    result['entries'] = feed_parser.entries
    # A version detected by replace_doctype takes precedence over the parser's.
    result['version'] = result['version'] or feed_parser.version
    result['namespaces'] = feed_parser.namespaces_in_use
    return result
|
||||
Loading…
Add table
Add a link
Reference in a new issue