# pipeline.rb
require 'nokogiri'
require 'active_support/xml_mini/nokogiri' # convert Documents to hashes

module HTML
  # GitHub HTML processing filters and utilities. This module includes a small
  # framework for defining DOM based content filters and applying them to user
  # provided content.
  #
  # See HTML::Pipeline::Filter for information on building filters.
  #
  # Construct a Pipeline for running multiple HTML filters.  A pipeline is created once
  # with one to many filters, and it then can be `call`ed many times over the course
  # of its lifetime with input.
  #
  # filters         - Array of Filter objects. Each must respond to `name` and
  #                   to call(doc, context, result), and return the modified
  #                   DocumentFragment or a String containing HTML markup.
  #                   Filters are performed in the order provided.
  # default_context - The default context hash. Values specified here are used
  #                   as defaults; the context passed to each individual pipeline
  #                   run is merged over them.  Can NOT be nil.  Default: empty Hash.
  # result_class    - The default Class of the result object for individual
  #                   calls.  Default: Hash.  Protip:  Pass in a Struct to get
  #                   some semblance of type safety.
  class Pipeline
    # Lazily loaded constants: each file below is required the first time its
    # constant is referenced.
    #
    # NOTE(review): the MentionFilter path contains a literal '@'; confirm it
    # matches the file name on disk.
    autoload :VERSION,               'html/pipeline/version'
    autoload :Filter,                'html/pipeline/filter'
    autoload :AbsoluteSourceFilter,  'html/pipeline/absolute_source_filter'
    autoload :BodyContent,           'html/pipeline/body_content'
    autoload :AutolinkFilter,        'html/pipeline/autolink_filter'
    autoload :CamoFilter,            'html/pipeline/camo_filter'
    autoload :EmailReplyFilter,      'html/pipeline/email_reply_filter'
    autoload :EmojiFilter,           'html/pipeline/emoji_filter'
    autoload :HttpsFilter,           'html/pipeline/https_filter'
    autoload :ImageFilter,           'html/pipeline/image_filter'
    autoload :ImageMaxWidthFilter,   'html/pipeline/image_max_width_filter'
    autoload :MarkdownFilter,        'html/pipeline/markdown_filter'
    autoload :MentionFilter,         'html/pipeline/@mention_filter'
    autoload :PlainTextInputFilter,  'html/pipeline/plain_text_input_filter'
    autoload :SanitizationFilter,    'html/pipeline/sanitization_filter'
    autoload :SyntaxHighlightFilter, 'html/pipeline/syntax_highlight_filter'
    autoload :TextileFilter,         'html/pipeline/textile_filter'
    autoload :TableOfContentsFilter, 'html/pipeline/toc_filter'
    autoload :TextFilter,            'html/pipeline/text_filter'

    # Raised when a library needed by a filter cannot be loaded.
    class MissingDependencyError < RuntimeError
    end

    # Public: Require a library on behalf of a filter, converting a LoadError
    # into a MissingDependencyError.
    #
    # name     - The String name of the library to require.
    # requirer - Identifies what needs the library; interpolated into the
    #            error message.
    #
    # Returns whatever `require` returns.
    # Raises MissingDependencyError if the library cannot be loaded.
    def self.require_dependency(name, requirer)
      begin
        require name
      rescue LoadError => e
        message = "Missing dependency '#{name}' for #{requirer}. See README.md for details.\n#{e.class.name}: #{e}"
        raise MissingDependencyError, message
      end
    end

    # Our DOM implementation.
    DocumentFragment = Nokogiri::HTML::DocumentFragment

    # Public: Coerce the input into a DocumentFragment.
    #
    # document_or_html - A String of HTML markup, an already-parsed document
    #                    object, or nil/false (treated as an empty String).
    #
    # Returns the DocumentFragment parsed from String input; any non-String
    # input is returned verbatim.
    def self.parse(document_or_html)
      source = document_or_html || ''
      return source unless source.is_a?(String)

      DocumentFragment.parse(source)
    end

    # Public: Returns an Array of Filter objects for this Pipeline.
    attr_reader :filters

    # Public: Instrumentation service for the pipeline.
    # Set an ActiveSupport::Notifications compatible object to enable.
    attr_accessor :instrumentation_service

    # Public: String name for this Pipeline. Defaults to Class name.
    attr_writer :instrumentation_name
    # Public: Returns the name identifying this pipeline in instrumentation
    # payloads. Falls back to the class name (and memoizes it) when no name has
    # been assigned; an explicitly assigned value, including nil, is returned
    # as-is.
    def instrumentation_name
      return @instrumentation_name if defined?(@instrumentation_name)

      @instrumentation_name = self.class.name
    end

    class << self
      # Public: Returns the default instrumentation service for new pipeline
      # objects.
      attr_reader :default_instrumentation_service

      # Public: Sets the default instrumentation service for new pipeline
      # objects.
      attr_writer :default_instrumentation_service
    end

    # Public: Build a pipeline.
    #
    # filters         - Array of filters; nested Arrays are flattened and the
    #                   resulting list is frozen.
    # default_context - Hash of default context values. It is frozen in place.
    #                   Must not be nil.  Default: empty Hash.
    # result_class    - Class instantiated for the result of each call.
    #                   Default: Hash.
    #
    # Raises ArgumentError if default_context is nil.
    def initialize(filters, default_context = {}, result_class = nil)
      raise ArgumentError, 'default_context cannot be nil' if default_context.nil?

      @filters = filters.flatten.freeze
      @default_context = default_context.freeze
      @result_class = result_class || Hash
      # New pipelines start with the class-wide default service.
      @instrumentation_service = self.class.default_instrumentation_service
    end

    # Apply all filters in the pipeline to the given HTML.
    #
    # html    - A String containing HTML or a DocumentFragment object.
    # context - The context hash passed to each filter. It is merged over the
    #           pipeline's default context and frozen, so filters MUST NOT
    #           modify it in place.  Use the result for passing state back.
    # result  - The result object passed to each filter for modification.  This
    #           is where filters store extracted information from the content.
    #           When nil, a new instance of the pipeline's result class is used.
    #
    # Returns the result after being filtered by this Pipeline.  Its :output
    # entry holds whatever the last filter returned (the unmodified input when
    # the pipeline has no filters).
    def call(html, context = {}, result = nil)
      context = @default_context.merge(context).freeze
      result ||= @result_class.new

      payload = default_payload(filters: @filters.map(&:name),
                                context: context, result: result)
      instrument('call_pipeline.html_pipeline', payload) do
        # Thread the document through the filters in order; each filter
        # receives the previous filter's return value.
        result[:output] =
          @filters.inject(html) do |doc, filter|
            perform_filter(filter, doc, context, result)
          end
      end
      result
    end

    # Internal: Applies a specific filter to the supplied doc.
    #
    # The call is instrumented as 'call_filter.html_pipeline' with the filter's
    # name, the context and the result in the payload.
    #
    # filter  - The filter to run; must respond to `name` and
    #           call(doc, context, result).
    # doc     - The document (or HTML String) handed to the filter.
    # context - The context hash for this pipeline run.
    # result  - The result object for this pipeline run.
    #
    # Returns the result of the filter.
    def perform_filter(filter, doc, context, result)
      payload = default_payload(filter: filter.name,
                                context: context, result: result)
      instrument('call_filter.html_pipeline', payload) do
        filter.call(doc, context, result)
      end
    end

    # Public: Run the pipeline and coerce its output to a DocumentFragment.
    #
    # A pipeline's :output may be a DocumentFragment or a String; callers that
    # need a DocumentFragment should use this method instead of `call`.
    #
    # Takes the same arguments as `call`.
    #
    # Returns the :output of the run passed through HTML::Pipeline.parse.
    def to_document(input, context = {}, result = nil)
      output = call(input, context, result)[:output]
      HTML::Pipeline.parse(output)
    end

    # Public: Run the pipeline and return its output as a String of HTML markup.
    #
    # input   - A String containing HTML or a DocumentFragment object.
    # context - The context hash for this run.
    # result  - Optional result object to be populated by the filters.  It is
    #           forwarded to `call` unchanged, so a caller-supplied result
    #           receives everything the filters store in it.
    #
    # Returns the output's #to_html when it responds to it, otherwise its #to_s.
    def to_html(input, context = {}, result = nil)
      # Forward the caller's result; resetting it to nil here would silently
      # discard the object the caller asked us to fill in.
      result = call(input, context, result)
      output = result[:output]
      if output.respond_to?(:to_html)
        output.to_html
      else
        output.to_s
      end
    end

    # Public: Configure the instrumentation name and service in one step.
    #
    # name    - The name to assign as the instrumentation name.  The value is
    #           stored as given, so omitting it stores nil.
    # service - The instrumentation service to use.  When omitted, the class's
    #           default instrumentation service is used instead.
    #
    # Returns nothing.
    def setup_instrumentation(name = nil, service = nil)
      self.instrumentation_name = name
      chosen_service = service || self.class.default_instrumentation_service
      self.instrumentation_service = chosen_service
    end

    # Internal: Run the block, wrapped in an instrumentation event when an
    # `instrumentation_service` is set.
    #
    # event   - The String event name.
    # payload - The payload Hash for the event.  Defaults to default_payload.
    #
    # Yields the payload: the one handed back by the service when one is set,
    # otherwise the payload itself.
    #
    # Returns the result of the provided block when no service is set,
    # otherwise whatever the service's `instrument` returns.
    def instrument(event, payload = nil)
      payload ||= default_payload
      if instrumentation_service
        instrumentation_service.instrument(event, payload) do |event_payload|
          yield event_payload
        end
      else
        yield payload
      end
    end

    # Internal: Default payload for instrumentation.
    #
    # payload - A Hash of additional payload data to be merged.  Its entries
    #           take precedence, so a :pipeline key in it replaces the default.
    #
    # Returns a new Hash containing :pipeline (the instrumentation name) plus
    # the supplied entries.
    def default_payload(payload = {})
      { pipeline: instrumentation_name }.merge(payload)
    end
  end
end

# XXX nokogiri monkey patches for Ruby 1.8
# String#force_encoding is missing only on Ruby 1.8, so the patch below is
# skipped entirely on later Rubies.
unless ''.respond_to?(:force_encoding)
  class Nokogiri::XML::Node
    # Work around an issue with UTF-8 encoded data being erroneously converted
    # to some other encoding when replacing text nodes. See 'utf-8 output 2'
    # in user_content_test.rb for details.
    #
    # replacement - The replacement node(s), or a String-like object which is
    #               first parsed into nodes via a temporary <div> wrapper.
    #
    # Returns whatever the original `replace` returns.
    def replace_with_encoding_fix(replacement)
      if replacement.respond_to?(:to_str)
        replacement = document.fragment("<div>#{replacement}</div>").children.first.children
      end
      replace_without_encoding_fix(replacement)
    end

    # Order matters: keep a handle on the original `replace` before pointing
    # `replace` at the patched version.
    alias replace_without_encoding_fix replace
    alias replace replace_with_encoding_fix

    # Replace this node like `replace`, but return self instead of the
    # replacement.
    def swap(replacement)
      replace(replacement)
      self
    end
  end
end