Primer commit del proyecto RSS
This commit is contained in:
commit
27c9515d29
1568 changed files with 252311 additions and 0 deletions
950
venv/lib/python3.12/site-packages/feedparser/sanitizer.py
Normal file
950
venv/lib/python3.12/site-packages/feedparser/sanitizer.py
Normal file
|
|
@ -0,0 +1,950 @@
|
|||
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
# This file is a part of feedparser.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import re
|
||||
|
||||
from .html import _BaseHTMLProcessor
|
||||
from .urls import make_safe_absolute_uri
|
||||
|
||||
|
||||
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Whitelist-based filter for HTML, MathML and SVG markup.

    Start and end tags are checked against the ``acceptable_*`` /
    ``mathml_*`` / ``svg_*`` tables below. Attributes are filtered per
    vocabulary, ``style`` values go through :meth:`sanitize_style` and
    ``href`` values through ``make_safe_absolute_uri``. Character data that
    appears inside an element listed in
    ``unacceptable_elements_with_end_tag`` is not forwarded to the base
    class.
    """

    # HTML element names that are passed through.
    acceptable_elements = {
        'a',
        'abbr',
        'acronym',
        'address',
        'area',
        'article',
        'aside',
        'audio',
        'b',
        'big',
        'blockquote',
        'br',
        'button',
        'canvas',
        'caption',
        'center',
        'cite',
        'code',
        'col',
        'colgroup',
        'command',
        'datagrid',
        'datalist',
        'dd',
        'del',
        'details',
        'dfn',
        'dialog',
        'dir',
        'div',
        'dl',
        'dt',
        'em',
        'event-source',
        'fieldset',
        'figcaption',
        'figure',
        'font',
        'footer',
        'form',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'header',
        'hr',
        'i',
        'img',
        'input',
        'ins',
        'kbd',
        'keygen',
        'label',
        'legend',
        'li',
        'm',
        'map',
        'menu',
        'meter',
        'multicol',
        'nav',
        'nextid',
        'noscript',
        'ol',
        'optgroup',
        'option',
        'output',
        'p',
        'pre',
        'progress',
        'q',
        's',
        'samp',
        'section',
        'select',
        'small',
        'sound',
        'source',
        'spacer',
        'span',
        'strike',
        'strong',
        'sub',
        'sup',
        'table',
        'tbody',
        'td',
        'textarea',
        'tfoot',
        'th',
        'thead',
        'time',
        'tr',
        'tt',
        'u',
        'ul',
        'var',
        'video',
    }

    # Attribute names that are kept on acceptable HTML elements.
    acceptable_attributes = {
        'abbr',
        'accept',
        'accept-charset',
        'accesskey',
        'action',
        'align',
        'alt',
        'autocomplete',
        'autofocus',
        'axis',
        'background',
        'balance',
        'bgcolor',
        'bgproperties',
        'border',
        'bordercolor',
        'bordercolordark',
        'bordercolorlight',
        'bottompadding',
        'cellpadding',
        'cellspacing',
        'ch',
        'challenge',
        'char',
        'charoff',
        'charset',
        'checked',
        'choff',
        'cite',
        'class',
        'clear',
        'color',
        'cols',
        'colspan',
        'compact',
        'contenteditable',
        'controls',
        'coords',
        'data',
        'datafld',
        'datapagesize',
        'datasrc',
        'datetime',
        'default',
        'delay',
        'dir',
        'disabled',
        'draggable',
        'dynsrc',
        'enctype',
        'end',
        'face',
        'for',
        'form',
        'frame',
        'galleryimg',
        'gutter',
        'headers',
        'height',
        'hidden',
        'hidefocus',
        'high',
        'href',
        'hreflang',
        'hspace',
        'icon',
        'id',
        'inputmode',
        'ismap',
        'keytype',
        'label',
        'lang',
        'leftspacing',
        'list',
        'longdesc',
        'loop',
        'loopcount',
        'loopend',
        'loopstart',
        'low',
        'lowsrc',
        'max',
        'maxlength',
        'media',
        'method',
        'min',
        'multiple',
        'name',
        'nohref',
        'noshade',
        'nowrap',
        'open',
        'optimum',
        'pattern',
        'ping',
        'point-size',
        'poster',
        'pqg',
        'preload',
        'prompt',
        'radiogroup',
        'readonly',
        'rel',
        'repeat-max',
        'repeat-min',
        'replace',
        'required',
        'rev',
        'rightspacing',
        'rows',
        'rowspan',
        'rules',
        'scope',
        'selected',
        'shape',
        'size',
        'span',
        'src',
        'start',
        'step',
        'style',
        'summary',
        'suppress',
        'tabindex',
        'target',
        'template',
        'title',
        'toppadding',
        'type',
        'unselectable',
        'urn',
        'usemap',
        'valign',
        'value',
        'variable',
        'volume',
        'vrml',
        'vspace',
        'width',
        'wrap',
        'xml:lang',
    }

    # Elements whose text content is suppressed until the matching end tag.
    unacceptable_elements_with_end_tag = {
        'applet',
        'script',
        'style',
    }

    # CSS properties whose declarations are kept without inspecting the value.
    acceptable_css_properties = {
        'azimuth',
        'background-color',
        'border-bottom-color',
        'border-collapse',
        'border-color',
        'border-left-color',
        'border-right-color',
        'border-top-color',
        'clear',
        'color',
        'cursor',
        'direction',
        'display',
        'elevation',
        'float',
        'font',
        'font-family',
        'font-size',
        'font-style',
        'font-variant',
        'font-weight',
        'height',
        'letter-spacing',
        'line-height',
        'overflow',
        'pause',
        'pause-after',
        'pause-before',
        'pitch',
        'pitch-range',
        'richness',
        'speak',
        'speak-header',
        'speak-numeral',
        'speak-punctuation',
        'speech-rate',
        'stress',
        'text-align',
        'text-decoration',
        'text-indent',
        'unicode-bidi',
        'vertical-align',
        'voice-family',
        'volume',
        'white-space',
        'width',
    }

    # survey of common keywords found in feeds
    acceptable_css_keywords = {
        '!important',
        'aqua',
        'auto',
        'black',
        'block',
        'blue',
        'bold',
        'both',
        'bottom',
        'brown',
        'center',
        'collapse',
        'dashed',
        'dotted',
        'fuchsia',
        'gray',
        'green',
        'italic',
        'left',
        'lime',
        'maroon',
        'medium',
        'navy',
        'none',
        'normal',
        'nowrap',
        'olive',
        'pointer',
        'purple',
        'red',
        'right',
        'silver',
        'solid',
        'teal',
        'top',
        'transparent',
        'underline',
        'white',
        'yellow',
    }

    # Non-keyword values accepted for background/border/margin/padding
    # shorthand properties (see sanitize_style).
    valid_css_values = re.compile(
        r'^('
        r'#[0-9a-f]+'  # Hex values
        r'|rgb\(\d+%?,\d*%?,?\d*%?\)?'  # RGB values
        r'|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?'  # Sizes/widths
        r')$'
    )

    # MathML element names accepted while inside a <math> element.
    mathml_elements = {
        'annotation',
        'annotation-xml',
        'maction',
        'maligngroup',
        'malignmark',
        'math',
        'menclose',
        'merror',
        'mfenced',
        'mfrac',
        'mglyph',
        'mi',
        'mlabeledtr',
        'mlongdiv',
        'mmultiscripts',
        'mn',
        'mo',
        'mover',
        'mpadded',
        'mphantom',
        'mprescripts',
        'mroot',
        'mrow',
        'ms',
        'mscarries',
        'mscarry',
        'msgroup',
        'msline',
        'mspace',
        'msqrt',
        'msrow',
        'mstack',
        'mstyle',
        'msub',
        'msubsup',
        'msup',
        'mtable',
        'mtd',
        'mtext',
        'mtr',
        'munder',
        'munderover',
        'none',
        'semantics',
    }

    # Attribute names kept on MathML elements.
    mathml_attributes = {
        'accent',
        'accentunder',
        'actiontype',
        'align',
        'alignmentscope',
        'altimg',
        'altimg-height',
        'altimg-valign',
        'altimg-width',
        'alttext',
        'bevelled',
        'charalign',
        'close',
        'columnalign',
        'columnlines',
        'columnspacing',
        'columnspan',
        'columnwidth',
        'crossout',
        'decimalpoint',
        'denomalign',
        'depth',
        'dir',
        'display',
        'displaystyle',
        'edge',
        'encoding',
        'equalcolumns',
        'equalrows',
        'fence',
        'fontstyle',
        'fontweight',
        'form',
        'frame',
        'framespacing',
        'groupalign',
        'height',
        'href',
        'id',
        'indentalign',
        'indentalignfirst',
        'indentalignlast',
        'indentshift',
        'indentshiftfirst',
        'indentshiftlast',
        'indenttarget',
        'infixlinebreakstyle',
        'largeop',
        'length',
        'linebreak',
        'linebreakmultchar',
        'linebreakstyle',
        'lineleading',
        'linethickness',
        'location',
        'longdivstyle',
        'lquote',
        'lspace',
        'mathbackground',
        'mathcolor',
        'mathsize',
        'mathvariant',
        'maxsize',
        'minlabelspacing',
        'minsize',
        'movablelimits',
        'notation',
        'numalign',
        'open',
        'other',
        'overflow',
        'position',
        'rowalign',
        'rowlines',
        'rowspacing',
        'rowspan',
        'rquote',
        'rspace',
        'scriptlevel',
        'scriptminsize',
        'scriptsizemultiplier',
        'selection',
        'separator',
        'separators',
        'shift',
        'side',
        'src',
        'stackalign',
        'stretchy',
        'subscriptshift',
        'superscriptshift',
        'symmetric',
        'voffset',
        'width',
        'xlink:href',
        'xlink:show',
        'xlink:type',
        'xmlns',
        'xmlns:xlink',
    }

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = {
        'a',
        'animate',
        'animateColor',
        'animateMotion',
        'animateTransform',
        'circle',
        'defs',
        'desc',
        'ellipse',
        'font-face',
        'font-face-name',
        'font-face-src',
        'foreignObject',
        'g',
        'glyph',
        'hkern',
        'line',
        'linearGradient',
        'marker',
        'metadata',
        'missing-glyph',
        'mpath',
        'path',
        'polygon',
        'polyline',
        'radialGradient',
        'rect',
        'set',
        'stop',
        'svg',
        'switch',
        'text',
        'title',
        'tspan',
        'use',
    }

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = {
        'accent-height',
        'accumulate',
        'additive',
        'alphabetic',
        'arabic-form',
        'ascent',
        'attributeName',
        'attributeType',
        'baseProfile',
        'bbox',
        'begin',
        'by',
        'calcMode',
        'cap-height',
        'class',
        'color',
        'color-rendering',
        'content',
        'cx',
        'cy',
        'd',
        'descent',
        'display',
        'dur',
        'dx',
        'dy',
        'end',
        'fill',
        'fill-opacity',
        'fill-rule',
        'font-family',
        'font-size',
        'font-stretch',
        'font-style',
        'font-variant',
        'font-weight',
        'from',
        'fx',
        'fy',
        'g1',
        'g2',
        'glyph-name',
        'gradientUnits',
        'hanging',
        'height',
        'horiz-adv-x',
        'horiz-origin-x',
        'id',
        'ideographic',
        'k',
        'keyPoints',
        'keySplines',
        'keyTimes',
        'lang',
        'marker-end',
        'marker-mid',
        'marker-start',
        'markerHeight',
        'markerUnits',
        'markerWidth',
        'mathematical',
        'max',
        'min',
        'name',
        'offset',
        'opacity',
        'orient',
        'origin',
        'overline-position',
        'overline-thickness',
        'panose-1',
        'path',
        'pathLength',
        'points',
        'preserveAspectRatio',
        'r',
        'refX',
        'refY',
        'repeatCount',
        'repeatDur',
        'requiredExtensions',
        'requiredFeatures',
        'restart',
        'rotate',
        'rx',
        'ry',
        'slope',
        'stemh',
        'stemv',
        'stop-color',
        'stop-opacity',
        'strikethrough-position',
        'strikethrough-thickness',
        'stroke',
        'stroke-dasharray',
        'stroke-dashoffset',
        'stroke-linecap',
        'stroke-linejoin',
        'stroke-miterlimit',
        'stroke-opacity',
        'stroke-width',
        'systemLanguage',
        'target',
        'text-anchor',
        'to',
        'transform',
        'type',
        'u1',
        'u2',
        'underline-position',
        'underline-thickness',
        'unicode',
        'unicode-range',
        'units-per-em',
        'values',
        'version',
        'viewBox',
        'visibility',
        'width',
        'widths',
        'x',
        'x-height',
        'x1',
        'x2',
        'xlink:actuate',
        'xlink:arcrole',
        'xlink:href',
        'xlink:role',
        'xlink:show',
        'xlink:title',
        'xlink:type',
        'xml:base',
        'xml:lang',
        'xml:space',
        'xmlns',
        'xmlns:xlink',
        'y',
        'y1',
        'y2',
        'zoomAndPan',
    }

    # Lowercase -> camelCase lookup tables; built on first use by
    # unknown_starttag() and stored on the instance.
    svg_attr_map = None
    svg_elem_map = None

    # CSS properties additionally accepted in style attributes while inside
    # an <svg> element.
    acceptable_svg_properties = {
        'fill',
        'fill-opacity',
        'fill-rule',
        'stroke',
        'stroke-linecap',
        'stroke-linejoin',
        'stroke-opacity',
        'stroke-width',
    }

    def __init__(self, encoding=None, _type='application/xhtml+xml'):
        """Initialise the base processor and zero the nesting counters."""
        super().__init__(encoding, _type)

        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def reset(self):
        """Reset the base processor and zero the nesting counters."""
        super().reset()
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def unknown_starttag(self, tag, attrs):
        """Filter a start tag and forward the cleaned version to the base class.

        :param tag: element name as delivered by the parser.
        :param attrs: list of ``(name, value)`` pairs. Namespace declarations
            may be appended to this list in place.

        Tags that are neither acceptable HTML nor MathML/SVG inside an open
        ``<math>``/``<svg>`` are dropped (nothing is forwarded). For kept
        tags only whitelisted attributes survive.
        """
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if tag not in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith('html'):
                if not dict(attrs).get('xmlns'):
                    if tag == 'svg':
                        attrs.append(('xmlns', 'http://www.w3.org/2000/svg'))
                    if tag == 'math':
                        attrs.append(('xmlns', 'http://www.w3.org/1998/Math/MathML'))

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # For most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case.
                if not self.svg_attr_map:
                    # Build the lowercase -> camelCase maps first, then
                    # shadow the class-level tables with lowercased *sets*
                    # on the instance so membership tests stay O(1).
                    self.svg_attr_map = {
                        a.lower(): a for a in self.svg_attributes if a != a.lower()
                    }
                    self.svg_attributes = {a.lower() for a in self.svg_attributes}

                    self.svg_elem_map = {
                        e.lower(): e for e in self.svg_elements if e != e.lower()
                    }
                    self.svg_elements = {e.lower() for e in self.svg_elements}
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif tag not in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if any(a[0].startswith('xlink:') for a in attrs):
                if ('xmlns:xlink', 'http://www.w3.org/1999/xlink') not in attrs:
                    attrs.append(('xmlns:xlink', 'http://www.w3.org/1999/xlink'))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key == 'style' and 'style' in acceptable_attributes:
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key, clean_value))
            elif key in acceptable_attributes:
                # restore the camelCase spelling of SVG attribute names
                key = keymap.get(key, key)
                # make sure the uri uses an acceptable uri scheme
                if key == 'href':
                    value = make_safe_absolute_uri(value)
                clean_attrs.append((key, value))
        super().unknown_starttag(tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Filter an end tag and forward it to the base class if it is kept.

        Also unwinds the ``unacceptablestack`` / ``mathmlOK`` / ``svgOK``
        counters incremented by :meth:`unknown_starttag`.
        """
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # Never go below zero: a stray end tag (e.g. "</script>"
                # without an opener) must not leave the counter negative,
                # which would suppress all following text and let a later
                # "<script>" bring the counter back to 0.
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == 'svg' and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        super().unknown_endtag(tag)

    def handle_pi(self, text):
        """Ignore processing instructions; nothing is emitted for them."""
        pass

    def handle_decl(self, text):
        """Ignore declarations; nothing is emitted for them."""
        pass

    def handle_data(self, text):
        """Forward character data unless inside an unacceptable element."""
        if not self.unacceptablestack:
            super().handle_data(text)

    def sanitize_style(self, style):
        """Return a filtered copy of a ``style`` attribute value.

        :param style: raw CSS declaration list.
        :returns: the accepted declarations, each rendered as
            ``"prop: value;"`` and joined with single spaces, or ``''`` when
            the input fails either of the two syntax checks or nothing is
            accepted.
        """
        # disallow urls
        style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        # This replaced a regexp that used re.match and was prone to
        # pathological back-tracking.
        if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ('background', 'border', 'margin', 'padding'):
                # Shorthand families: keep the declaration only if every
                # whitespace-separated token is a known keyword or matches
                # valid_css_values.
                for keyword in value.split():
                    if (
                        keyword not in self.acceptable_css_keywords
                        and not self.valid_css_values.match(keyword)
                    ):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

    def parse_comment(self, i, report=1):
        """Parse a comment at offset *i*, tolerating malformed comments.

        Delegates to the base class first; when that returns a negative
        value, skips to the end of the first ``--...>`` sequence found at or
        after ``i + 4``, or to the end of the buffered data when none exists.
        """
        ret = super().parse_comment(i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = re.search(r'--[^>]*>', self.rawdata[i + 4:])
        if match:
            return i + 4 + match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
|
||||
|
||||
|
||||
def _sanitize_html(html_source, encoding, _type):
    """Run *html_source* through ``_HTMLSanitizer`` and return the result.

    :param html_source: markup to sanitize (a ``str``).
    :param encoding: passed to the ``_HTMLSanitizer`` constructor.
    :param _type: content type, passed to the ``_HTMLSanitizer`` constructor.
    :returns: the sanitizer output with surrounding whitespace stripped and
        ``\\r\\n`` line endings converted to ``\\n``.
    """
    p = _HTMLSanitizer(encoding, _type)
    # Escape the opening bracket of CDATA markers so they reach the
    # sanitizer as plain text. (Replacing the marker with itself, as the
    # code previously did, was a no-op.)
    html_source = html_source.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(html_source)
    data = p.output()
    data = data.strip().replace('\r\n', '\n')
    return data
|
||||
|
||||
|
||||
# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(br'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(br'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)

# Match safe entity declarations.
# This will allow numeric character references through,
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
RE_SAFE_ENTITY_PATTERN = re.compile(br'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')


def replace_doctype(data):
    """Strip and replace the DOCTYPE of an XML document.

    :param data: the document as ``bytes``.
    :returns: a 3-tuple ``(version, data, safe_entities)`` where

        * ``version`` is ``'rss091n'`` when the first DOCTYPE mentions
          "netscape" (case-insensitively), otherwise ``None``;
        * ``data`` is the document with every ENTITY declaration removed
          from the prolog and every DOCTYPE replaced by a minimal DOCTYPE
          that re-declares only the safe entities (or by nothing);
        * ``safe_entities`` maps the names of the re-declared entities to
          their values, both decoded as UTF-8.
    """

    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    first_element = re.search(br'<\w', data)
    start = first_element.start() if first_element else -1
    head, data = data[:start+1], data[start+1:]

    # Save and then remove all of the ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(b'', head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results[0] if doctype_results else b''
    if b'netscape' in doctype.lower():
        version = 'rss091n'
    else:
        version = None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = b''
    if len(doctype_results) == 1 and entity_results:
        safe_entities = [
            e
            for e in entity_results
            if RE_SAFE_ENTITY_PATTERN.match(e)
        ]
        if safe_entities:
            replacement = b'<!DOCTYPE feed [\n<!ENTITY' \
                          + b'>\n<!ENTITY '.join(safe_entities) \
                          + b'>\n]>'
    # Use a callable so the replacement is inserted verbatim: as a template
    # string, a backslash inside an entity value would be parsed as a regex
    # escape (raising re.error or expanding a group reference).
    data = RE_DOCTYPE_PATTERN.sub(lambda _match: replacement, head) + data

    # Precompute the safe entities for the loose parser.
    safe_entities = {
        k.decode('utf-8'): v.decode('utf-8')
        for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
    }
    return version, data, safe_entities
|
||||
Loading…
Add table
Add a link
Reference in a new issue