From 09cbd6c65e628d5fdf15accefae2bd3968fbb54d Mon Sep 17 00:00:00 2001 From: kibigo! Date: Tue, 8 Nov 2022 20:59:07 -0800 Subject: [PATCH] =?UTF-8?q?Use=20a=20tree=E2=80=90based=20approach=20for?= =?UTF-8?q?=20adv.=20text=20formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sanitizing HTML/Markdown means parsing the content into an HTML tree under‐the‐hood anyway, and it is more accurate to do mention/hashtag replacement on the text nodes in that tree than it is to try to hack it in with regexes et cetera. This undoes the overrides of `#entities` and `#rewrite` on `AdvancedTextFormatter` but also stops using them, instead keeping track of the parsed Nokogiri tree itself and using that in the `#to_s` method. Internally, this tree uses `<mastodon-entity>` nodes to keep track of hashtags, links, and mentions. Sanitization is moved to the beginning, so it should be known that these do not appear in the input. --- app/lib/advanced_text_formatter.rb | 116 +++++++++++++++++------------ 1 file changed, 69 insertions(+), 47 deletions(-) diff --git a/app/lib/advanced_text_formatter.rb b/app/lib/advanced_text_formatter.rb index dcaf34b91..4917f8e6f 100644 --- a/app/lib/advanced_text_formatter.rb +++ b/app/lib/advanced_text_formatter.rb @@ -33,67 +33,89 @@ class AdvancedTextFormatter < TextFormatter @text = format_markdown(text) if content_type == 'text/markdown' end - # Differs from TextFormatter by not messing with newline after parsing + # Differs from TextFormatter by operating on the parsed HTML tree ;) + # + # See +#tree+ def to_s return ''.html_safe if text.blank? 
- html = rewrite do |entity| - if entity[:url] - link_to_url(entity) - elsif entity[:hashtag] - link_to_hashtag(entity) - elsif entity[:screen_name] - link_to_mention(entity) + result = tree.dup + result.css('mastodon-entity').each do |entity| + case entity['kind'] + when 'hashtag' + entity.replace(link_to_hashtag({ hashtag: entity['value'] })) + when 'link' + entity.replace(link_to_url({ url: entity['value'] })) + when 'mention' + entity.replace(link_to_mention({ screen_name: entity['value'] })) end end - - html.html_safe # rubocop:disable Rails/OutputSafety + result.to_html end - # Differs from `TextFormatter` by skipping HTML tags and entities - def entities - @entities ||= begin - gaps = [] - total_offset = 0 + ## + # Process the status into a Nokogiri document fragment, with entities + # replaced with +<mastodon-entity>+s. + # + # Since +<mastodon-entity>+ is not allowed by the sanitizer, any such + # elements in the output *must* have been produced by this algorithm. + # + # These elements will need to be replaced prior to serialization (see + # +#to_s+). + def tree + if @tree.nil? + src = text.gsub(Sanitize::REGEX_UNSUITABLE_CHARS, '') + @tree = Nokogiri::HTML5.fragment(src) + Sanitize.node!(@tree, Sanitize::Config::MASTODON_OUTGOING) + document = @tree.document - escaped = text.gsub(/<[^>]*>|&#[0-9]+;/) do |match| - total_offset += match.length - 1 - end_offset = Regexp.last_match.end(0) - gaps << [end_offset - total_offset, total_offset] - ' ' - end - - Extractor.extract_entities_with_indices(escaped, extract_url_without_protocol: false).map do |entity| - start_pos, end_pos = entity[:indices] - offset_idx = gaps.rindex { |gap| gap.first <= start_pos } - offset = offset_idx.nil? ? 0 : gaps[offset_idx].last - entity.merge(indices: [start_pos + offset, end_pos + offset]) + @tree.xpath('.//text()[not(ancestor::a)]').each do |text_node| + # Iterate over text elements and build up their replacements. 
+ content = text_node.content + replacement = Nokogiri::XML::NodeSet.new(document) + processed_index = 0 + Extractor.extract_entities_with_indices( + content, + extract_url_without_protocol: false + ) do |entity| + # Iterate over entities in this text node. + advance = entity[:indices].first - processed_index + if advance.positive? + # Text node for content which precedes entity. + replacement << Nokogiri::XML::Text.new( + content[processed_index, advance], + @tree.document + ) + end + elt = Nokogiri::XML::Element.new('mastodon-entity', document) + if entity[:url] + elt['kind'] = 'link' + elt['value'] = entity[:url] + elsif entity[:hashtag] + elt['kind'] = 'hashtag' + elt['value'] = entity[:hashtag] + elsif entity[:screen_name] + elt['kind'] = 'mention' + elt['value'] = entity[:screen_name] + end + replacement << elt + processed_index = entity[:indices].last + end + if processed_index < content.size + # Text node for remaining content. + replacement << Nokogiri::XML::Text.new( + content[processed_index, content.size - processed_index], + document + ) + end + text_node.replace(replacement) end end + @tree end private - # Differs from `TextFormatter` in that it keeps HTML; but it sanitizes at the end to remain safe - def rewrite - entities.sort_by! do |entity| - entity[:indices].first - end - - result = ''.dup - - last_index = entities.reduce(0) do |index, entity| - indices = entity[:indices] - result << text[index...indices.first] - result << yield(entity) - indices.last - end - - result << text[last_index..-1] - - Sanitize.fragment(result, Sanitize::Config::MASTODON_OUTGOING) - end - def format_markdown(html) html = markdown_formatter.render(html) html.delete("\r").delete("\n")