Use a tree‐based approach for adv. text formatting

Sanitizing HTML/Markdown means parsing the content into an HTML tree
under‐the‐hood anyway, and it is more accurate to do mention/hashtag
replacement on the text nodes in that tree than it is to try to hack it
in with regexes et cetera.

This undoes the overrides of `#entities` and `#rewrite` on
`AdvancedTextFormatter` but also stops using them, instead keeping
track of the parsed Nokogiri tree itself and using that in the `#to_s`
method.

Internally, this tree uses `<mastodon-entity>` nodes to keep track of
hashtags, links, and mentions. Sanitization is moved to the beginning,
so it should be known that these do not appear in the input.
This commit is contained in:
kibigo! 2022-11-08 20:59:07 -08:00
parent b2a25d446a
commit 09cbd6c65e
No known key found for this signature in database
GPG Key ID: 6B751C35B7EABB6F
1 changed files with 69 additions and 47 deletions

View File

@ -33,67 +33,89 @@ class AdvancedTextFormatter < TextFormatter
@text = format_markdown(text) if content_type == 'text/markdown'
end
# Differs from TextFormatter by not messing with newline after parsing
# Differs from TextFormatter by operating on the parsed HTML tree ;)
#
# See +#tree+
def to_s
return ''.html_safe if text.blank?
html = rewrite do |entity|
if entity[:url]
link_to_url(entity)
elsif entity[:hashtag]
link_to_hashtag(entity)
elsif entity[:screen_name]
link_to_mention(entity)
result = tree.dup
result.css('mastodon-entity').each do |entity|
case entity['kind']
when 'hashtag'
entity.replace(link_to_hashtag({ hashtag: entity['value'] }))
when 'link'
entity.replace(link_to_url({ url: entity['value'] }))
when 'mention'
entity.replace(link_to_mention({ screen_name: entity['value'] }))
end
end
html.html_safe # rubocop:disable Rails/OutputSafety
result.to_html
end
# Differs from `TextFormatter` by skipping HTML tags and entities
def entities
@entities ||= begin
gaps = []
total_offset = 0
##
# Process the status into a Nokogiri document fragment, with entities
# replaced with +<mastodon-entity>+s.
#
# Since +<mastodon-entity>+ is not allowed by the sanitizer, any such
# elements in the output *must* have been produced by this algorithm.
#
# These elements will need to be replaced prior to serialization (see
# +#to_s+).
def tree
if @tree.nil?
src = text.gsub(Sanitize::REGEX_UNSUITABLE_CHARS, '')
@tree = Nokogiri::HTML5.fragment(src)
Sanitize.node!(@tree, Sanitize::Config::MASTODON_OUTGOING)
document = @tree.document
escaped = text.gsub(/<[^>]*>|&#[0-9]+;/) do |match|
total_offset += match.length - 1
end_offset = Regexp.last_match.end(0)
gaps << [end_offset - total_offset, total_offset]
' '
end
Extractor.extract_entities_with_indices(escaped, extract_url_without_protocol: false).map do |entity|
start_pos, end_pos = entity[:indices]
offset_idx = gaps.rindex { |gap| gap.first <= start_pos }
offset = offset_idx.nil? ? 0 : gaps[offset_idx].last
entity.merge(indices: [start_pos + offset, end_pos + offset])
@tree.xpath('.//text()[not(ancestor::a)]').each do |text_node|
# Iterate over text elements and build up their replacements.
content = text_node.content
replacement = Nokogiri::XML::NodeSet.new(document)
processed_index = 0
Extractor.extract_entities_with_indices(
content,
extract_url_without_protocol: false
) do |entity|
# Iterate over entities in this text node.
advance = entity[:indices].first - processed_index
if advance.positive?
# Text node for content which precedes entity.
replacement << Nokogiri::XML::Text.new(
content[processed_index, advance],
@tree.document
)
end
elt = Nokogiri::XML::Element.new('mastodon-entity', document)
if entity[:url]
elt['kind'] = 'link'
elt['value'] = entity[:url]
elsif entity[:hashtag]
elt['kind'] = 'hashtag'
elt['value'] = entity[:hashtag]
elsif entity[:screen_name]
elt['kind'] = 'mention'
elt['value'] = entity[:screen_name]
end
replacement << elt
processed_index = entity[:indices].last
end
if processed_index < content.size
# Text node for remaining content.
replacement << Nokogiri::XML::Text.new(
content[processed_index, content.size - processed_index],
document
)
end
text_node.replace(replacement)
end
end
@tree
end
private
# Differs from `TextFormatter` in that it keeps HTML; but it sanitizes at the end to remain safe
def rewrite
entities.sort_by! do |entity|
entity[:indices].first
end
result = ''.dup
last_index = entities.reduce(0) do |index, entity|
indices = entity[:indices]
result << text[index...indices.first]
result << yield(entity)
indices.last
end
result << text[last_index..-1]
Sanitize.fragment(result, Sanitize::Config::MASTODON_OUTGOING)
end
def format_markdown(html)
html = markdown_formatter.render(html)
html.delete("\r").delete("\n")