Commit 40823adc authored by Andrew Lee

Merge branch 'collabora' into 'collabora'

obs-api: embed sanitize-4.0.0 to fix runtime issue

See merge request !1
parents 3a1fe003 bb9d59ac
open-build-service (2.7.1-10co6) tools; urgency=medium
* Embed sanitize 4.0.0 ruby gem to fix breakage. (T7297)
- add obs-api runtime depends on ruby-nokogumbo and ruby-crass.
-- Héctor Orón Martínez <hector.oron@collabora.co.uk> Thu, 19 Oct 2017 16:27:27 +0200
open-build-service (2.7.1-10co5) tools; urgency=medium
* Remove stray debug print from backend-Handle-ERROR_WANT_-READ-WRITE-from-ssl-reads.patch
......
......@@ -161,6 +161,7 @@ Depends: apache2,
ruby-codemirror-rails,
ruby-coderay,
ruby-crack,
ruby-crass,
ruby-cssmin,
ruby-daemons,
ruby-dalli,
......@@ -189,6 +190,7 @@ Depends: apache2,
ruby-mime-types,
ruby-mysql2,
ruby-nokogiri,
ruby-nokogumbo,
ruby-parser,
ruby-pkg-config (>= 1.1.6),
ruby-pundit,
......
# encoding: utf-8
require 'nokogumbo'
require 'set'
require_relative 'sanitize/version'
require_relative 'sanitize/config'
require_relative 'sanitize/config/default'
require_relative 'sanitize/config/restricted'
require_relative 'sanitize/config/basic'
require_relative 'sanitize/config/relaxed'
require_relative 'sanitize/css'
require_relative 'sanitize/transformers/clean_cdata'
require_relative 'sanitize/transformers/clean_comment'
require_relative 'sanitize/transformers/clean_css'
require_relative 'sanitize/transformers/clean_doctype'
require_relative 'sanitize/transformers/clean_element'
class Sanitize
  # The fully merged configuration this sanitizer instance operates with.
  attr_reader :config

  # Detects an attribute value that a browser could interpret as a URL carrying
  # a protocol prefix ("http:", "javascript:", ...). Everything up to the first
  # colon is captured; the colon may also appear as a decimal or hex character
  # entity, even without its terminating semicolon (some legacy browsers still
  # parse those).
  REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i

  # Unicode characters that are removed from the input before it is handed to
  # the HTML parser.
  #
  # http://www.w3.org/TR/unicode-xml/#Charlist
  REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u

  #--
  # Class Methods
  #++

  class << self
    # Sanitizes the complete _html_ document with a throwaway Sanitize instance
    # built from _config_ and returns the resulting markup.
    #
    # The `<html>` element has to be whitelisted for document sanitization,
    # otherwise an error is raised; use {.fragment} when that is not wanted.
    def document(html, config = {})
      Sanitize.new(config).document(html)
    end

    # Sanitizes the _html_ fragment with a throwaway Sanitize instance built
    # from _config_ and returns the resulting markup.
    def fragment(html, config = {})
      Sanitize.new(config).fragment(html)
    end

    # Sanitizes the given `Nokogiri::XML::Node` (and its descendants) in place
    # with a throwaway Sanitize instance built from _config_.
    def node!(node, config = {})
      Sanitize.new(config).node!(node)
    end

    # Names kept for pre-3.0.0 backwards compatibility.

    # @deprecated Use {.document} instead.
    alias_method :clean_document, :document

    # @deprecated Use {.fragment} instead.
    alias_method :clean, :fragment

    # @deprecated Use {.node!} instead.
    alias_method :clean_node!, :node!
  end

  #--
  # Instance Methods
  #++

  # Builds a sanitizer whose settings are Config::DEFAULT overlaid with
  # _config_, and assembles the transformer chain.
  def initialize(config = {})
    @config = Config.merge(Config::DEFAULT, config)

    # User-supplied transformers come first; the built-in ones appended below
    # always run after them.
    @transformers = Array(@config[:transformers].dup)

    @transformers << Transformers::CleanComment unless @config[:allow_comments]
    @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]

    # One CSS sanitizer is shared by the <style> element cleaner and the style
    # attribute cleaner. It is built from the caller-supplied config, not from
    # the merged @config.
    css_sanitizer = nil

    if @config[:elements].include?('style')
      css_sanitizer = Sanitize::CSS.new(config)
      @transformers << Transformers::CSS::CleanElement.new(css_sanitizer)
    end

    style_attr_allowed = @config[:attributes].values.any? do |names|
      names.include?('style')
    end

    if style_attr_allowed
      css_sanitizer ||= Sanitize::CSS.new(config)
      @transformers << Transformers::CSS::CleanAttribute.new(css_sanitizer)
    end

    @transformers << Transformers::CleanCDATA
    @transformers << Transformers::CleanElement.new(@config)
  end

  # Sanitizes a complete _html_ document and returns the resulting markup.
  # Returns an empty string when _html_ is nil or false.
  #
  # The `<html>` element has to be whitelisted, otherwise an error is raised;
  # use {#fragment} when that is not wanted.
  def document(html)
    return '' unless html

    parsed = Nokogiri::HTML5.parse(preprocess(html))
    node!(parsed)
    to_html(parsed)
  end

  # Sanitizes an _html_ fragment and returns the resulting markup. Returns an
  # empty string when _html_ is nil or false.
  def fragment(html)
    return '' unless html

    html = preprocess(html)
    parsed = Nokogiri::HTML5.parse("<html><body>#{html}")

    # Fragments that themselves start with <body> keep that element; otherwise
    # only the body's children are taken. Trick borrowed from
    # Nokogiri::HTML::DocumentFragment.
    path = html =~ /\A<body(?:\s|>)/i ? '/html/body' : '/html/body/node()'

    piece = parsed.fragment
    piece << parsed.xpath(path)

    node!(piece)
    to_html(piece)
  end

  # Sanitizes the given `Nokogiri::XML::Node` and every descendant in place and
  # returns the node.
  #
  # Raises ArgumentError when _node_ is not a `Nokogiri::XML::Node`, and
  # Sanitize::Error when it is a `Nokogiri::XML::Document` while `<html>` is
  # not whitelisted.
  def node!(node)
    raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

    if node.is_a?(Nokogiri::XML::Document) && !@config[:elements].include?('html')
      raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
    end

    # Nodes that transformers have explicitly approved during this run.
    approved = Set.new
    traverse(node) { |current| transform_node!(current, approved) }

    node
  end

  # @deprecated Use {#document} instead.
  alias_method :clean_document, :document

  # @deprecated Use {#fragment} instead.
  alias_method :clean, :fragment

  # @deprecated Use {#node!} instead.
  alias_method :clean_node!, :node!

  private

  # Returns a UTF-8 copy of _html_ (converted with `to_s`) from which the
  # characters in REGEX_UNSUITABLE_CHARS have been removed. Input in another
  # encoding is transcoded, replacing invalid and undefined sequences.
  def preprocess(html)
    text = html.to_s.dup

    if text.encoding.name != 'UTF-8'
      text.encode!('UTF-8', invalid: :replace, undef: :replace)
    end

    text.gsub!(REGEX_UNSUITABLE_CHARS, '')
    text
  end

  # Serializes _node_ to an HTML string without added formatting.
  def to_html(node)
    replace_meta = false

    # Workaround for a libxml2 bug that injects an unwanted Content-Type meta
    # tag into every serialized HTML document.
    #
    # https://github.com/sparklemotion/nokogiri/issues/1008
    document_types = [
      Nokogiri::XML::Node::DOCUMENT_NODE,
      Nokogiri::XML::Node::HTML_DOCUMENT_NODE
    ]

    if document_types.include?(node.type)
      regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i

      # Strip the tag when <meta> is not whitelisted, or when the document
      # being serialized carries no content-type meta tag of its own.
      meta_allowed = @config[:elements].include?('meta')

      replace_meta = !meta_allowed ||
        node.xpath('/html/head/meta[@http-equiv]').none? { |meta|
          meta['http-equiv'].downcase == 'content-type'
        }
    end

    options = Nokogiri::XML::Node::SaveOptions

    # No indentation or formatting, so Nokogiri does not add newlines after
    # certain tags.
    markup = node.to_html(
      encoding: 'utf-8',
      indent: 0,
      save_with: options::NO_DECLARATION | options::NO_EMPTY_TAGS | options::AS_HTML
    )

    markup.gsub!(regex_meta, '\1') if replace_meta
    markup
  end

  # Runs every transformer against _node_, in chain order. A transformer that
  # returns a Hash whose :node_whitelist value is enumerable has those nodes
  # added to _node_whitelist_. Returns _node_.
  def transform_node!(node, node_whitelist)
    @transformers.each do |transformer|
      outcome = transformer.call(
        config: @config,
        is_whitelisted: node_whitelist.include?(node),
        node: node,
        node_name: node.name.downcase,
        node_whitelist: node_whitelist
      )

      next unless outcome.is_a?(Hash)

      additions = outcome[:node_whitelist]
      node_whitelist.merge(additions) if additions.respond_to?(:each)
    end

    node
  end

  # Top-down traversal: yields _node_ first, then recurses into each of its
  # children in order.
  def traverse(node, &block)
    yield node

    current = node.child

    while current
      before = current.previous_sibling
      traverse(current, &block)

      current =
        if current.parent == node
          current.next_sibling
        elsif before
          # The child was unlinked or reparented: resume after the node that
          # used to precede it.
          before.next_sibling
        else
          # No preceding node: restart from the parent's (new) first child.
          node.child
        end
    end
  end

  class Error < StandardError; end
end
# encoding: utf-8
require 'set'
class Sanitize
  module Config

    # Recursively freezes _config_: the values of a Hash and the members of an
    # Array or Set are frozen first, then the object itself. Returns _config_.
    def self.freeze_config(config)
      case config
      when Hash
        config.each_value { |value| freeze_config(value) }
      when Array, Set
        config.each { |item| freeze_config(item) }
      end

      config.freeze
    end

    # Deep-merges *other_config* into *config* and returns the result as a new
    # Hash; neither argument is modified.
    #
    # For a key present in *other_config*: two Hashes are merged recursively
    # (an empty original is simply replaced by a copy of the new Hash), an
    # Array is stored as a Set unless the key is :transformers, and any other
    # value is copied when it can be. Keys found only in *config* are copied
    # over.
    #
    # This is the safest way to base a custom config on a built-in one.
    #
    # Raises ArgumentError unless both arguments are Hashes.
    def self.merge(config, other_config = {})
      raise ArgumentError, 'config must be a Hash' unless Hash === config
      raise ArgumentError, 'other_config must be a Hash' unless Hash === other_config

      all_keys = (config.keys + other_config.keys).uniq

      all_keys.each_with_object({}) do |key, result|
        base_value = config[key]

        unless other_config.has_key?(key)
          result[key] = can_dupe?(base_value) ? base_value.dup : base_value
          next
        end

        override = other_config[key]

        result[key] =
          if Hash === base_value && Hash === override
            base_value.empty? ? override.dup : merge(base_value, override)
          elsif Array === override && key != :transformers
            Set.new(override)
          elsif can_dupe?(override)
            override.dup
          else
            override
          end
      end
    end

    # Tells whether `dup` may safely be called on _value_: false for true,
    # false, nil, numbers and symbols, true for everything else.
    def self.can_dupe?(value)
      case value
      when true, false, nil, Numeric, Symbol then false
      else true
      end
    end
    private_class_method :can_dupe?

  end
end
# encoding: utf-8
class Sanitize
  module Config
    # Deep-frozen configuration that extends the RESTRICTED element list with
    # common inline and list markup, allows a small set of attributes, limits
    # URL-bearing attributes to the listed protocols and adds rel="nofollow"
    # to every link.
    BASIC = freeze_config(
      # Elements allowed on top of those in RESTRICTED.
      elements: RESTRICTED[:elements] + %w[
        a abbr blockquote br cite code dd dfn dl dt kbd li mark ol p pre q s
        samp small strike sub sup time ul var
      ],

      # Attributes allowed per element.
      attributes: {
        'a'          => %w[href],
        'abbr'       => %w[title],
        'blockquote' => %w[cite],
        'dfn'        => %w[title],
        'q'          => %w[cite],
        'time'       => %w[datetime pubdate]
      },

      # Attributes added to specific elements.
      add_attributes: {
        'a' => { 'rel' => 'nofollow' }
      },

      # Protocols allowed per element and attribute; :relative stands for URLs
      # without a protocol.
      protocols: {
        'a'          => { 'href' => ['ftp', 'http', 'https', 'mailto', :relative] },
        'blockquote' => { 'cite' => ['http', 'https', :relative] },
        'q'          => { 'cite' => ['http', 'https', :relative] }
      }
    )
  end
end
# encoding: utf-8
class Sanitize
  module Config
    # Deep-frozen baseline configuration. Nothing is whitelisted here, so
    # sanitizing with these settings alone strips all HTML.
    DEFAULT = freeze_config(
      # Attributes to add to specific elements. None are added by default.
      add_attributes: {},

      # Whether HTML comments are kept. Keeping them is strongly discouraged
      # because IE allows script execution within conditional comments.
      allow_comments: false,

      # Whether well-formed doctype declarations such as "<!DOCTYPE html>" are
      # kept when sanitizing a document. Ignored when sanitizing fragments.
      allow_doctype: false,

      # Attributes allowed per element. None are allowed by default. The
      # symbol :data stands for arbitrary HTML5 data-* attributes.
      attributes: {},

      # Settings for CSS sanitization.
      css: {
        # Whether CSS comments are kept.
        allow_comments: false,

        # Whether browser compatibility hacks such as the IE * and _ hacks are
        # kept. Generally harmless, but technically invalid CSS.
        allow_hacks: false,

        # Allowed at-rules that may come without an associated block (e.g.
        # "import").
        #
        # https://developer.mozilla.org/en-US/docs/Web/CSS/At-rule
        at_rules: [],

        # Allowed at-rules whose blocks may contain properties (e.g.
        # "font-face").
        at_rules_with_properties: [],

        # Allowed at-rules whose blocks may contain styles (e.g. "media").
        at_rules_with_styles: [],

        # Allowed CSS properties.
        properties: [],

        # URL protocols allowed inside CSS URLs.
        protocols: []
      },

      # Allowed HTML elements. None by default, meaning all HTML is stripped.
      elements: [],

      # URL protocols allowed in specific attributes. None by default. Use
      # :relative in place of a protocol to allow relative URLs.
      protocols: {},

      # When true, the contents of every filtered element are removed together
      # with the element. By default the safe parts of an element's contents
      # are left behind.
      #
      # When this is an Array of element names, only the contents of those
      # elements (when filtered) are removed; the contents of all other
      # filtered elements are left behind.
      remove_contents: false,

      # Custom callables that filter or alter nodes. See README.md for details
      # and examples.
      transformers: [],

      # Elements whose contents get wrapped in the given `before` and `after`
      # values when the element itself is removed, to keep the text readable.
      # For example, `foo<div>bar</div>baz` becomes 'foo bar baz' once the
      # <div> is removed. Every element listed here is padded with one space
      # on each side.
      whitespace_elements: %w[
        address article aside blockquote br dd div dl dt footer h1 h2 h3 h4 h5
        h6 header hgroup hr li nav ol p pre section ul
      ].each_with_object({}) { |element, padding|
        padding[element] = { before: ' ', after: ' ' }
      }
    )
  end
end
# encoding: utf-8
class Sanitize
module Config
RELAXED = freeze_config(
:elements => BASIC[:elements] + %w[
address article aside bdi bdo body caption col colgroup data del div
figcaption figure footer h1 h2 h3 h4 h5 h6 head header hgroup hr html
img ins main nav rp rt ruby section span style summary sup table tbody
td tfoot th thead title tr wbr
],
:allow_doctype => true,
:attributes => merge(BASIC[:attributes],
:all => %w[class dir hidden id lang style tabindex title translate],
'a' => %w[href hreflang name rel],
'col' => %w[span width],
'colgroup' => %w[span width],
'data' => %w[value],
'del' => %w[cite datetime],
'img' => %w[align alt border height src width],
'ins' => %w[cite datetime],
'li' => %w[value],
'ol' => %w[reversed start type],
'style' => %w[media scoped type],
'table' => %w[align bgcolor border cellpadding cellspacing frame rules sortable summary width],
'td' => %w[abbr align axis colspan headers rowspan valign width],
'th' => %w[abbr align axis colspan headers rowspan scope sorted valign width],
'ul' => %w[type]
),
:protocols => merge(BASIC[:protocols],
'del' => {'cite' => ['http', 'https', :relative]},
'img' => {'src' => ['http', 'https', :relative]},
'ins' => {'cite' => ['http', 'https', :relative]}
),
:css => {
:allow_comments => true,
:allow_hacks => true,
:at_rules_with_properties => %w[
bottom-center
bottom-left
bottom-left-corner
bottom-right
bottom-right-corner
font-face
left-bottom
left-middle
left-top
page
right-bottom
right-middle
right-top
top-center
top-left
top-left-corner
top-right
top-right-corner
],
:at_rules_with_styles => %w[
-moz-keyframes
-o-keyframes
-webkit-keyframes
document
keyframes
media
supports
],
:protocols => ['http', 'https', :relative],
:properties => %w[
-moz-appearance
-moz-background-inline-policy
-moz-box-sizing
-moz-column-count
-moz-column-fill
-moz-column-gap
-moz-column-rule
-moz-column-rule-color
-moz-column-rule-style
-moz-column-rule-width
-moz-column-width
-moz-font-feature-settings
-moz-font-language-override
-moz-hyphens
-moz-text-align-last
-moz-text-decoration-color
-moz-text-decoration-line
-moz-text-decoration-style
-moz-text-size-adjust
-ms-background-position-x
-ms-background-position-y
-ms-block-progression
-ms-content-zoom-chaining
-ms-content-zoom-limit
-ms-content-zoom-limit-max
-ms-content-zoom-limit-min
-ms-content-zoom-snap
-ms-content-zoom-snap-points
-ms-content-zoom-snap-type
-ms-content-zooming
-ms-filter
-ms-flex
-ms-flex-align