From 09cbd6c65e628d5fdf15accefae2bd3968fbb54d Mon Sep 17 00:00:00 2001
From: kibigo! <go@kibi.family>
Date: Tue, 8 Nov 2022 20:59:07 -0800
Subject: [PATCH] =?UTF-8?q?Use=20a=20tree=E2=80=90based=20approach=20for?=
 =?UTF-8?q?=20adv.=20text=20formatting?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sanitizing HTML/Markdown means parsing the content into an HTML tree
under‐the‐hood anyway, and it is more accurate to do mention/hashtag
replacement on the text nodes in that tree than it is to try to hack it
in with regexes et cetera.

This removes the `#entities` and `#rewrite` overrides from
`AdvancedTextFormatter` and also stops calling those methods at all.
Instead, the formatter keeps track of the parsed Nokogiri tree itself
and uses that tree in the `#to_s` method.

Internally, this tree uses `<mastodon-entity>` nodes to keep track of
hashtags, links, and mentions. Sanitization now happens first, before
entity extraction, so any `<mastodon-entity>` node found in the tree is
known to have been generated internally rather than taken from the input.
---
 app/lib/advanced_text_formatter.rb | 116 +++++++++++++++++------------
 1 file changed, 69 insertions(+), 47 deletions(-)
diff --git a/app/lib/advanced_text_formatter.rb b/app/lib/advanced_text_formatter.rb
index dcaf34b91..4917f8e6f 100644
--- a/app/lib/advanced_text_formatter.rb
+++ b/app/lib/advanced_text_formatter.rb
@@ -33,67 +33,89 @@ class AdvancedTextFormatter < TextFormatter
     @text = format_markdown(text) if content_type == 'text/markdown'
   end
 
-  # Differs from TextFormatter by not messing with newline after parsing
+  # Differs from TextFormatter by operating on the parsed HTML tree ;)
+  #
+  # See +#tree+
   def to_s
     return ''.html_safe if text.blank?
 
-    html = rewrite do |entity|
-      if entity[:url]
-        link_to_url(entity)
-      elsif entity[:hashtag]
-        link_to_hashtag(entity)
-      elsif entity[:screen_name]
-        link_to_mention(entity)
+    result = tree.dup
+    result.css('mastodon-entity').each do |entity|
+      case entity['kind']
+      when 'hashtag'
+        entity.replace(link_to_hashtag({ hashtag: entity['value'] }))
+      when 'link'
+        entity.replace(link_to_url({ url: entity['value'] }))
+      when 'mention'
+        entity.replace(link_to_mention({ screen_name: entity['value'] }))
       end
     end
-
-    html.html_safe # rubocop:disable Rails/OutputSafety
+    result.to_html
   end
 
-  # Differs from `TextFormatter` by skipping HTML tags and entities
-  def entities
-    @entities ||= begin
-      gaps = []
-      total_offset = 0
+  ##
+  # Parse the text into a sanitized Nokogiri fragment (memoized in +@tree+)
+  # whose URL/hashtag/mention entities, found in text nodes outside +<a>+,
+  # become +<mastodon-entity>+ placeholders with +kind+/+value+ attributes.
+  #
+  # Sanitization runs before extraction, so (provided the sanitizer config
+  # disallows +<mastodon-entity>+) any such element in the result *must*
+  # have been produced here. Placeholders still need to be replaced prior
+  # to serialization (see +#to_s+).
+  def tree
+    if @tree.nil?
+      src = text.gsub(Sanitize::REGEX_UNSUITABLE_CHARS, '')
+      @tree = Nokogiri::HTML5.fragment(src)
+      Sanitize.node!(@tree, Sanitize::Config::MASTODON_OUTGOING)
+      document = @tree.document
 
-      escaped = text.gsub(/<[^>]*>|&#[0-9]+;/) do |match|
-        total_offset += match.length - 1
-        end_offset = Regexp.last_match.end(0)
-        gaps << [end_offset - total_offset, total_offset]
-        ' '
-      end
-
-      Extractor.extract_entities_with_indices(escaped, extract_url_without_protocol: false).map do |entity|
-        start_pos, end_pos = entity[:indices]
-        offset_idx = gaps.rindex { |gap| gap.first <= start_pos }
-        offset = offset_idx.nil? ? 0 : gaps[offset_idx].last
-        entity.merge(indices: [start_pos + offset, end_pos + offset])
+      @tree.xpath('.//text()[not(ancestor::a)]').each do |text_node|
+        # Split this text node into plain-text runs and entity placeholders.
+        content = text_node.content
+        replacement = Nokogiri::XML::NodeSet.new(document)
+        processed_index = 0
+        Extractor.extract_entities_with_indices(
+          content,
+          extract_url_without_protocol: false
+        ) do |entity|
+          # Presumably yields entities one by one in position order (verify).
+          advance = entity[:indices].first - processed_index
+          if advance.positive?
+            # Emit the plain text between the previous entity and this one.
+            replacement << Nokogiri::XML::Text.new(
+              content[processed_index, advance],
+              @tree.document
+            )
+          end
+          elt = Nokogiri::XML::Element.new('mastodon-entity', document)
+          if entity[:url]
+            elt['kind'] = 'link'
+            elt['value'] = entity[:url]
+          elsif entity[:hashtag]
+            elt['kind'] = 'hashtag'
+            elt['value'] = entity[:hashtag]
+          elsif entity[:screen_name]
+            elt['kind'] = 'mention'
+            elt['value'] = entity[:screen_name]
+          end
+          replacement << elt
+          processed_index = entity[:indices].last
+        end
+        if processed_index < content.size
+          # Emit whatever text follows the last entity.
+          replacement << Nokogiri::XML::Text.new(
+            content[processed_index, content.size - processed_index],
+            document
+          )
+        end
+        text_node.replace(replacement)
       end
     end
+    @tree
   end
 
   private
 
-  # Differs from `TextFormatter` in that it keeps HTML; but it sanitizes at the end to remain safe
-  def rewrite
-    entities.sort_by! do |entity|
-      entity[:indices].first
-    end
-
-    result = ''.dup
-
-    last_index = entities.reduce(0) do |index, entity|
-      indices = entity[:indices]
-      result << text[index...indices.first]
-      result << yield(entity)
-      indices.last
-    end
-
-    result << text[last_index..-1]
-
-    Sanitize.fragment(result, Sanitize::Config::MASTODON_OUTGOING)
-  end
-
   def format_markdown(html)
     html = markdown_formatter.render(html)
     html.delete("\r").delete("\n")