Commit dff667d0 authored by Héctor Orón Martínez's avatar Héctor Orón Martínez

Merge branch 'debian/sanitize' into 'debian/master'

obs-api: embed sanitize-4.0.0 to fix runtime issue

See merge request ruby-team/open-build-service!4
parents d436b01c 279feefd
open-build-service (2.7.4-3) UNRELEASED; urgency=medium
* Embed sanitize 4.0.0 ruby gem to fix breakage.
- add obs-api runtime depends on ruby-nokogumbo and ruby-crass.
* worker: document enable switch
* worker: use /var/lib/obsworker as OBS_RUN_DIR
......
......@@ -163,6 +163,7 @@ Depends: apache2,
ruby-codemirror-rails,
ruby-coderay,
ruby-crack,
ruby-crass,
ruby-cssmin,
ruby-daemons,
ruby-dalli,
......@@ -191,6 +192,7 @@ Depends: apache2,
ruby-mime-types,
ruby-mysql2,
ruby-nokogiri,
ruby-nokogumbo,
ruby-parser,
ruby-pkg-config (>= 1.1.6),
ruby-pundit,
......
# encoding: utf-8
require 'nokogumbo'
require 'set'
require_relative 'sanitize/version'
require_relative 'sanitize/config'
require_relative 'sanitize/config/default'
require_relative 'sanitize/config/restricted'
require_relative 'sanitize/config/basic'
require_relative 'sanitize/config/relaxed'
require_relative 'sanitize/css'
require_relative 'sanitize/transformers/clean_cdata'
require_relative 'sanitize/transformers/clean_comment'
require_relative 'sanitize/transformers/clean_css'
require_relative 'sanitize/transformers/clean_doctype'
require_relative 'sanitize/transformers/clean_element'
class Sanitize
attr_reader :config
# Matches an attribute value that could be treated by a browser as a URL
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
# or more characters followed by a colon is considered a match, even if the
# colon is encoded as an entity and even if it's an incomplete entity (which
# IE6 and Opera will still parse).
REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
# Matches Unicode characters that should be stripped from HTML before passing
# it to the parser.
#
# http://www.w3.org/TR/unicode-xml/#Charlist
REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
#--
# Class Methods
#++
# Convenience wrapper: builds a one-off sanitizer from _config_ and returns
# a sanitized copy of the complete _html_ document.
#
# The `<html>` element must be whitelisted in _config_; otherwise an error is
# raised. Use {.fragment} when that requirement is undesirable.
def self.document(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.document(html)
end
# Convenience wrapper: builds a one-off sanitizer from _config_ and returns
# a sanitized copy of the _html_ fragment.
def self.fragment(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.fragment(html)
end
# Convenience wrapper: builds a one-off sanitizer from _config_ and uses it
# to sanitize the given `Nokogiri::XML::Node` together with all its children.
def self.node!(node, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.node!(node)
end
# Class-level method names kept for callers written against pre-3.0.0
# releases. Each alias points at its modern replacement.
class << self
  # @deprecated Use {.document} instead.
  alias_method :clean_document, :document

  # @deprecated Use {.fragment} instead.
  alias_method :clean, :fragment

  # @deprecated Use {.node!} instead.
  alias_method :clean_node!, :node!
end
#--
# Instance Methods
#++
# Returns a new Sanitize object initialized with the settings in _config_.
#
# _config_ is merged over Config::DEFAULT, and the transformer chain is then
# assembled: custom transformers from the config come first, followed by the
# built-in ones selected by the merged settings.
def initialize(config = {})
  @config = Config.merge(Config::DEFAULT, config)

  # Array() copes with both a single transformer and a list of them.
  @transformers = Array(@config[:transformers].dup)

  # Default transformers always run at the end of the chain, after any custom
  # transformers.
  @transformers << Transformers::CleanComment unless @config[:allow_comments]
  @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]

  # The CSS sanitizer is built from the merged @config, not the raw argument.
  # Sanitize::CSS#initialize reads `config[:css] || config`, so a caller config
  # without a :css key would otherwise be treated as the CSS config itself and
  # HTML-level keys such as :allow_comments or :protocols would override the
  # CSS defaults. The merged config always carries the :css section.
  if @config[:elements].include?('style')
    scss = Sanitize::CSS.new(@config)
    @transformers << Transformers::CSS::CleanElement.new(scss)
  end

  if @config[:attributes].values.any? {|attr| attr.include?('style') }
    # Reuse the sanitizer created above when <style> elements are allowed too.
    scss ||= Sanitize::CSS.new(@config)
    @transformers << Transformers::CSS::CleanAttribute.new(scss)
  end

  @transformers <<
    Transformers::CleanCDATA <<
    Transformers::CleanElement.new(@config)
end
# Parses _html_ as a complete HTML document, sanitizes it and returns the
# serialized result. A falsy _html_ yields an empty string.
#
# The `<html>` element must be whitelisted or an error will be raised; use
# {#fragment} if that is undesirable.
def document(html)
  if html
    parsed = Nokogiri::HTML5.parse(preprocess(html))
    node!(parsed)
    to_html(parsed)
  else
    ''
  end
end
# @deprecated Use {#document} instead.
alias_method :clean_document, :document
# Sanitizes _html_ as a fragment (not a full document) and returns the
# serialized result. A falsy _html_ yields an empty string.
def fragment(html)
  return '' unless html

  cleaned = preprocess(html)
  doc = Nokogiri::HTML5.parse("<html><body>#{cleaned}")

  # Hack to allow fragments containing <body>. Borrowed from
  # Nokogiri::HTML::DocumentFragment: when the input itself opens with a
  # <body> tag the body element is selected, otherwise only its children.
  path = (cleaned =~ /\A<body(?:\s|>)/i) ? '/html/body' : '/html/body/node()'

  frag = doc.fragment
  frag << doc.xpath(path)

  node!(frag)
  to_html(frag)
end
# @deprecated Use {#fragment} instead.
alias_method :clean, :fragment
# Sanitizes _node_ (a `Nokogiri::XML::Node`) and every descendant in place,
# then returns _node_.
#
# Raises ArgumentError when _node_ is not a `Nokogiri::XML::Node`. When
# _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
# whitelisted or an Error is raised.
def node!(node)
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

  if node.is_a?(Nokogiri::XML::Document) && !@config[:elements].include?('html')
    raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
  end

  # One whitelist is shared by every transformer call during this traversal.
  whitelist = Set.new
  traverse(node) { |current| transform_node!(current, whitelist) }

  node
end
# @deprecated Use {#node!} instead.
alias_method :clean_node!, :node!
private
# Prepares _html_ for parsing. Works on a copy of `html.to_s`, transcodes it
# to UTF-8 when it is in another encoding (replacing invalid and undefined
# characters), strips everything matched by REGEX_UNSUITABLE_CHARS and
# returns the resulting string.
def preprocess(html)
  text = html.to_s.dup

  if text.encoding.name != 'UTF-8'
    text.encode!('UTF-8', :invalid => :replace, :undef => :replace)
  end

  # gsub! returns nil when nothing matched, so return the string explicitly.
  text.gsub!(REGEX_UNSUITABLE_CHARS, '')
  text
end
# Serializes _node_ to a UTF-8 HTML string without added formatting.
#
# For document nodes, the Content-Type <meta> tag that libxml2 injects is
# stripped again unless the source document really contained one and <meta>
# is whitelisted.
def to_html(node)
  # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
  # meta tag to all serialized HTML documents.
  #
  # https://github.com/sparklemotion/nokogiri/issues/1008
  document_types = [
    Nokogiri::XML::Node::DOCUMENT_NODE,
    Nokogiri::XML::Node::HTML_DOCUMENT_NODE
  ]

  strip_meta = false

  if document_types.include?(node.type)
    # Only strip the content-type meta tag if <meta> isn't whitelisted or
    # the original document didn't actually include a content-type meta tag.
    strip_meta = !@config[:elements].include?('meta') ||
      node.xpath('/html/head/meta[@http-equiv]').none? { |meta|
        meta['http-equiv'].downcase == 'content-type'
      }
  end

  save_options = Nokogiri::XML::Node::SaveOptions

  # Serialize to HTML without any formatting to prevent Nokogiri from adding
  # newlines after certain tags.
  html = node.to_html(
    :encoding  => 'utf-8',
    :indent    => 0,
    :save_with => save_options::NO_DECLARATION |
                  save_options::NO_EMPTY_TAGS |
                  save_options::AS_HTML
  )

  if strip_meta
    html.gsub!(%r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i, '\1')
  end

  html
end
# Runs every transformer in the chain against _node_, in order, and returns
# _node_.
#
# Each transformer is called with the config, the node, its lowercased name,
# the shared _node_whitelist_ and whether the node is currently in that
# whitelist. When a transformer returns a Hash whose :node_whitelist value is
# enumerable, those entries are merged into _node_whitelist_ so that later
# transformers see them.
def transform_node!(node, node_whitelist)
  @transformers.each do |transformer|
    # Name and whitelist membership are recomputed for every transformer.
    outcome = transformer.call(
      :config         => @config,
      :is_whitelisted => node_whitelist.include?(node),
      :node           => node,
      :node_name      => node.name.downcase,
      :node_whitelist => node_whitelist
    )

    next unless outcome.is_a?(Hash)

    additions = outcome[:node_whitelist]
    node_whitelist.merge(additions) if additions.respond_to?(:each)
  end

  node
end
# Walks the tree rooted at _node_ top-down: the block is yielded _node_
# first, then each child subtree is traversed in order. The walk tolerates
# the block unlinking or reparenting the node it was just given.
def traverse(node, &block)
  yield node

  current = node.child

  while current
    # Remember the left neighbour before descending, in case the block
    # detaches `current` from this parent.
    before = current.previous_sibling
    traverse(current, &block)

    current =
      if current.parent == node
        current.next_sibling
      elsif before
        # The child was unlinked or reparented: resume after the node that
        # used to precede it.
        before.next_sibling
      else
        # No preceding node existed, so restart from the parent's first child.
        node.child
      end
  end
end
# Error type raised by the sanitizer itself, for example when a document is
# sanitized without `<html>` being whitelisted.
class Error < StandardError
end
end
# encoding: utf-8
require 'set'
class Sanitize
  module Config

    # Recursively freezes _config_ and everything reachable through Hash
    # values and Array/Set members, then returns the frozen _config_.
    def self.freeze_config(config)
      case config
      when Hash
        config.each_value { |value| freeze_config(value) }
      when Array, Set
        config.each { |member| freeze_config(member) }
      end

      config.freeze
    end

    # Builds and returns a new Hash holding *other_config* deeply merged over
    # *config*. Neither argument is modified.
    #
    # Nested Hashes are merged recursively, Array overrides are converted to
    # Sets (except under the :transformers key), and other values are
    # duplicated when possible.
    #
    # This is the safest way to use a built-in Sanitize config as the basis
    # for your own custom config.
    #
    # Raises ArgumentError unless both arguments are Hashes.
    def self.merge(config, other_config = {})
      raise ArgumentError, 'config must be a Hash' unless Hash === config
      raise ArgumentError, 'other_config must be a Hash' unless Hash === other_config

      # Array#| keeps first-seen order and drops duplicate keys.
      (config.keys | other_config.keys).each_with_object({}) do |key, result|
        base = config[key]

        unless other_config.has_key?(key)
          result[key] = can_dupe?(base) ? base.dup : base
          next
        end

        override = other_config[key]

        result[key] =
          if Hash === base && Hash === override
            # Nothing to merge into an empty base; a copy of the override will do.
            base.empty? ? override.dup : merge(base, override)
          elsif Array === override && key != :transformers
            Set.new(override)
          elsif can_dupe?(override)
            override.dup
          else
            override
          end
      end
    end

    # Returns `true` if `dup` may be safely called on _value_, `false`
    # otherwise (booleans, nil, numbers and symbols).
    def self.can_dupe?(value)
      singleton_value = true == value || false == value || value.nil?
      !singleton_value && [Numeric, Symbol].none? { |klass| klass === value }
    end
    private_class_method :can_dupe?

  end
end
# encoding: utf-8
class Sanitize
  module Config
    # Extends RESTRICTED with common inline and block text elements, a small
    # set of attributes, a forced rel="nofollow" on links, and URL protocol
    # restrictions for link and citation attributes.
    BASIC = freeze_config(
      # Everything RESTRICTED allows, plus the elements listed here.
      :elements => RESTRICTED[:elements] + [
        'a', 'abbr', 'blockquote', 'br', 'cite', 'code', 'dd', 'dfn', 'dl',
        'dt', 'kbd', 'li', 'mark', 'ol', 'p', 'pre', 'q', 's', 'samp',
        'small', 'strike', 'sub', 'sup', 'time', 'ul', 'var'
      ],

      # Attributes allowed per element.
      :attributes => {
        'a'          => ['href'],
        'abbr'       => ['title'],
        'blockquote' => ['cite'],
        'dfn'        => ['title'],
        'q'          => ['cite'],
        'time'       => ['datetime', 'pubdate']
      },

      # Attributes added to specific elements.
      :add_attributes => {
        'a' => { 'rel' => 'nofollow' }
      },

      # URL protocols allowed per attribute; :relative permits URLs that
      # carry no protocol.
      :protocols => {
        'a'          => { 'href' => %w[ftp http https mailto] << :relative },
        'blockquote' => { 'cite' => %w[http https] << :relative },
        'q'          => { 'cite' => %w[http https] << :relative }
      }
    )
  end
end
# encoding: utf-8
class Sanitize
  module Config
    # Baseline settings that every custom config is merged over. Out of the
    # box nothing is allowed, so all HTML is stripped.
    DEFAULT = freeze_config(
      # Attributes to add to specific elements. None by default.
      :add_attributes => {},

      # Whether HTML comments are kept. Strongly discouraged, since IE allows
      # script execution within conditional comments.
      :allow_comments => false,

      # Whether well-formed doctype declarations such as "<!DOCTYPE html>" are
      # kept when sanitizing a document. Ignored when sanitizing fragments.
      :allow_doctype => false,

      # Attributes to allow on specific elements. None by default. The symbol
      # :data stands for arbitrary HTML5 data-* attributes.
      :attributes => {},

      # Settings for CSS sanitization.
      :css => {
        # Keep CSS comments?
        :allow_comments => false,

        # Keep browser compatibility hacks such as the IE * and _ hacks?
        # These are generally harmless, but technically result in invalid CSS.
        :allow_hacks => false,

        # At-rules to allow that may not have associated blocks (e.g.
        # "import").
        #
        # https://developer.mozilla.org/en-US/docs/Web/CSS/At-rule
        :at_rules => [],

        # At-rules to allow whose blocks may contain properties (e.g.
        # "font-face").
        :at_rules_with_properties => [],

        # At-rules to allow whose blocks may contain styles (e.g. "media").
        :at_rules_with_styles => [],

        # Properties to allow.
        :properties => [],

        # URL protocols to allow inside CSS URLs.
        :protocols => []
      },

      # Elements to allow. None by default, which means that all HTML is
      # stripped.
      :elements => [],

      # URL protocols to allow in specific attributes. None by default. Use
      # :relative in place of a protocol to allow relative URLs sans protocol.
      :protocols => {},

      # When true, the contents of any filtered element are removed along
      # with the element itself. By default the safe parts of an element's
      # contents are left behind when the element is removed.
      #
      # When this is an Array of element names, only the contents of those
      # elements (when filtered) are removed; the contents of all other
      # filtered elements are left behind.
      :remove_contents => false,

      # Custom logic for filtering or altering nodes. See README.md for
      # details and examples.
      :transformers => [],

      # Elements whose contents are surrounded by the `before` and `after`
      # values when the element is removed, to preserve readability. For
      # example, `foo<div>bar</div>baz` becomes 'foo bar baz' when the <div>
      # is removed. Every element listed here is padded with a single space
      # on each side.
      :whitespace_elements => %w[
        address article aside blockquote br dd div dl dt footer h1 h2 h3 h4 h5
        h6 header hgroup hr li nav ol p pre section ul
      ].each_with_object({}) { |name, padding|
        padding[name] = { :before => ' ', :after => ' ' }
      }
    )
  end
end
This diff is collapsed.
# encoding: utf-8
class Sanitize
  module Config
    # Smallest built-in whitelist: a handful of inline text-formatting
    # elements. No other settings are specified.
    RESTRICTED = freeze_config(
      :elements => ['b', 'em', 'i', 'strong', 'u']
    )
  end
end
# encoding: utf-8
require 'crass'
require 'set'
class Sanitize; class CSS
attr_reader :config
# -- Class Methods -----------------------------------------------------------
# Sanitizes a string of inline CSS properties using a one-off sanitizer
# built from _config_.
#
# Intended for non-stylesheet CSS such as the value of an HTML `style`
# attribute. For a complete stylesheet, use {.stylesheet}.
#
# @example
#   Sanitize::CSS.properties("background: url(foo.png); color: #fff;")
#
# @return [String] Sanitized CSS properties.
def self.properties(css, config = {})
  sanitizer = new(config)
  sanitizer.properties(css)
end
# Sanitizes a complete CSS stylesheet using a one-off sanitizer built from
# _config_.
#
# Stylesheets may contain selectors, at-rules and comments. For inline style
# properties only (e.g. an HTML `style` attribute), use {.properties}.
#
# @example
#   css = %[
#     .foo {
#       background: url(foo.png);
#       color: #fff;
#     }
#
#     #bar {
#       font: 42pt 'Comic Sans MS';
#     }
#   ]
#
#   Sanitize::CSS.stylesheet(css, Sanitize::Config::RELAXED)
#
# @return [String] Sanitized CSS stylesheet.
def self.stylesheet(css, config = {})
  sanitizer = new(config)
  sanitizer.stylesheet(css)
end
# Sanitizes a Crass CSS parse tree and all of its children in place, using a
# one-off sanitizer built from _config_.
#
# @example
#   css = %[
#     .foo {
#       background: url(foo.png);
#       color: #fff;
#     }
#
#     #bar {
#       font: 42pt 'Comic Sans MS';
#     }
#   ]
#
#   tree = Crass.parse(css)
#   Sanitize::CSS.tree!(tree, Sanitize::Config::RELAXED)
#
# @return [Array] Sanitized Crass CSS parse tree.
def self.tree!(tree, config = {})
  sanitizer = new(config)
  sanitizer.tree!(tree)
end
# -- Instance Methods --------------------------------------------------------
# Returns a new Sanitize::CSS object initialized with the settings in
# _config_.
#
# _config_ may be a full Sanitize config, in which case its :css section is
# used, or a bare CSS settings Hash. Either way the settings are merged over
# the default CSS settings.
def initialize(config = {})
  css_settings = config[:css] || config
  @config = Config.merge(Config::DEFAULT[:css], css_settings)

  # Keep the allowed at-rule names as Sets.
  @at_rules                 = Set.new(@config[:at_rules])
  @at_rules_with_properties = Set.new(@config[:at_rules_with_properties])
  @at_rules_with_styles     = Set.new(@config[:at_rules_with_styles])
end
# Parses _css_ as a list of inline style properties, sanitizes the resulting
# tree and returns it serialized back to a string.
#
# Intended for non-stylesheet CSS such as the value of an HTML `style`
# attribute. For a complete stylesheet, use {#stylesheet}.
#
# @example
#   scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
#   scss.properties("background: url(foo.png); color: #fff;")
#
# @return [String] Sanitized CSS properties.
def properties(css)
  keep_comments = @config[:allow_comments]
  keep_hacks    = @config[:allow_hacks]

  nodes = Crass.parse_properties(css,
    :preserve_comments => keep_comments,
    :preserve_hacks    => keep_hacks)

  tree!(nodes)
  Crass::Parser.stringify(nodes)
end
# Parses _css_ as a complete stylesheet, sanitizes the resulting tree and
# returns it serialized back to a string.
#
# Stylesheets may contain selectors, at-rules and comments. For inline style
# properties only (e.g. an HTML `style` attribute), use {#properties}.
#
# @example
#   css = %[
#     .foo {
#       background: url(foo.png);
#       color: #fff;
#     }
#
#     #bar {
#       font: 42pt 'Comic Sans MS';
#     }
#   ]
#
#   scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
#   scss.stylesheet(css)
#
# @return [String] Sanitized CSS stylesheet.
def stylesheet(css)
  keep_comments = @config[:allow_comments]
  keep_hacks    = @config[:allow_hacks]

  nodes = Crass.parse(css,
    :preserve_comments => keep_comments,
    :preserve_hacks    => keep_hacks)

  tree!(nodes)
  Crass::Parser.stringify(nodes)
end
# Sanitizes the given Crass CSS parse tree and all its children, modifying it
# in place.
#
# @example
# css = %[
# .foo {
# background: url(foo.png);
# color: #fff;
# }
#
# #bar {
# font: 42pt 'Comic Sans MS';
# }
# ]
#
# scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
# tree = Crass.parse(css)
#
# scss.tree!(tree)
#
# @return [Array] Sanitized Crass CSS parse tree.
def tree!(tree)
preceded_by_property = false
tree.map! do |node|
next nil if node.nil?
case node[:node]
when :at_rule
preceded_by_property = false
next at_rule!(node)