From 431503bae2dc6e12bdade7d5d20f707112c2f7c2 Mon Sep 17 00:00:00 2001 From: David Yip Date: Mon, 6 Nov 2017 16:48:36 -0600 Subject: [PATCH 1/5] Also run the keyword matcher on a status' tags. #208. --- app/lib/feed_manager.rb | 8 ++++++-- spec/lib/feed_manager_spec.rb | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/app/lib/feed_manager.rb b/app/lib/feed_manager.rb index 3b16b5d52..414632a8a 100644 --- a/app/lib/feed_manager.rb +++ b/app/lib/feed_manager.rb @@ -173,10 +173,14 @@ class FeedManager def keyword_filter?(status, matcher) should_filter = matcher =~ status.text should_filter ||= matcher =~ status.spoiler_text + should_filter ||= status.tags.find_each.any? { |t| matcher =~ t.name } if status.reblog? - should_filter ||= matcher =~ status.reblog.text - should_filter ||= matcher =~ status.reblog.spoiler_text + reblog = status.reblog + + should_filter ||= matcher =~ reblog.text + should_filter ||= matcher =~ reblog.spoiler_text + should_filter ||= reblog.tags.find_each.any? { |t| matcher =~ t.name } end !!should_filter diff --git a/spec/lib/feed_manager_spec.rb b/spec/lib/feed_manager_spec.rb index 715d85306..0e4968440 100644 --- a/spec/lib/feed_manager_spec.rb +++ b/spec/lib/feed_manager_spec.rb @@ -164,6 +164,22 @@ RSpec.describe FeedManager do expect(FeedManager.instance.filter?(:home, reblog, alice.id)).to be true end + + it 'returns true for a status with a tag that matches a muted keyword' do + Fabricate('Glitch::KeywordMute', account: alice, keyword: 'jorts') + status = Fabricate(:status, account: bob) + status.tags << Fabricate(:tag, name: 'jorts') + + expect(FeedManager.instance.filter?(:home, status, alice.id)).to be true + end + + it 'returns true for a status with a tag that matches an octothorpe-prefixed muted keyword' do + Fabricate('Glitch::KeywordMute', account: alice, keyword: '#jorts') + status = Fabricate(:status, account: bob) + status.tags << Fabricate(:tag, name: 'jorts') + + expect(FeedManager.instance.filter?(:home, status, alice.id)).to be true + end end context 'for mentions feed' do From cb4ef24ac9d48e70648135f106fdc275dedf14fc Mon Sep 17 00:00:00 2001 From: David Yip Date: Wed, 15 Nov 2017 17:26:29 -0600 Subject: [PATCH 2/5] Match keyword mute filter on hashtags. #208. It is reasonable to expect someone to enter #foo to mute hashtag #foo. However, tags are recorded on statuses without the preceding #. To adjust for this, we build a separate tag matcher and use Tag::HASHTAG_RE to extract a hashtag from the hashtag syntax. --- app/lib/feed_manager.rb | 21 ++++--- app/models/glitch/keyword_mute.rb | 76 ++++++++++++++++++------- spec/models/glitch/keyword_mute_spec.rb | 4 +- 3 files changed, 69 insertions(+), 32 deletions(-) diff --git a/app/lib/feed_manager.rb b/app/lib/feed_manager.rb index 414632a8a..e12a5c016 100644 --- a/app/lib/feed_manager.rb +++ b/app/lib/feed_manager.rb @@ -141,7 +141,7 @@ class FeedManager return false if receiver_id == status.account_id return true if status.reply? && (status.in_reply_to_id.nil? || status.in_reply_to_account_id.nil?) - return true if keyword_filter?(status, Glitch::KeywordMute.matcher_for(receiver_id)) + return true if keyword_filter?(status, receiver_id) check_for_mutes = [status.account_id] check_for_mutes.concat(status.mentions.pluck(:account_id)) @@ -170,17 +170,20 @@ class FeedManager false end - def keyword_filter?(status, matcher) - should_filter = matcher =~ status.text - should_filter ||= matcher =~ status.spoiler_text - should_filter ||= status.tags.find_each.any? { |t| matcher =~ t.name } + def keyword_filter?(status, receiver_id) + text_matcher = Glitch::KeywordMute.text_matcher_for(receiver_id) + tag_matcher = Glitch::KeywordMute.tag_matcher_for(receiver_id) + + should_filter = text_matcher =~ status.text + should_filter ||= text_matcher =~ status.spoiler_text + should_filter ||= tag_matcher =~ status.tags if status.reblog? reblog = status.reblog - should_filter ||= matcher =~ reblog.text - should_filter ||= matcher =~ reblog.spoiler_text - should_filter ||= reblog.tags.find_each.any? { |t| matcher =~ t.name } + should_filter ||= text_matcher =~ reblog.text + should_filter ||= text_matcher =~ reblog.spoiler_text + should_filter ||= tag_matcher =~ status.tags end !!should_filter @@ -195,7 +198,7 @@ class FeedManager should_filter = Block.where(account_id: receiver_id, target_account_id: check_for_blocks).any? # Filter if it's from someone I blocked, in reply to someone I blocked, or mentioning someone I blocked should_filter ||= (status.account.silenced? && !Follow.where(account_id: receiver_id, target_account_id: status.account_id).exists?) # of if the account is silenced and I'm not following them - should_filter ||= keyword_filter?(status, Glitch::KeywordMute.matcher_for(receiver_id)) # or if the mention contains a muted keyword + should_filter ||= keyword_filter?(status, receiver_id) # or if the mention contains a muted keyword should_filter end diff --git a/app/models/glitch/keyword_mute.rb b/app/models/glitch/keyword_mute.rb index 009de1880..733dd0bc8 100644 --- a/app/models/glitch/keyword_mute.rb +++ b/app/models/glitch/keyword_mute.rb @@ -16,44 +16,37 @@ class Glitch::KeywordMute < ApplicationRecord validates_presence_of :keyword - after_commit :invalidate_cached_matcher + after_commit :invalidate_cached_matchers - def self.matcher_for(account_id) - Matcher.new(account_id) + def self.text_matcher_for(account_id) + TextMatcher.new(account_id) + end + + def self.tag_matcher_for(account_id) + TagMatcher.new(account_id) end private - def invalidate_cached_matcher - Rails.cache.delete("keyword_mutes:regex:#{account_id}") + def invalidate_cached_matchers + Rails.cache.delete(TextMatcher.cache_key(account_id)) + Rails.cache.delete(TagMatcher.cache_key(account_id)) end - class Matcher + class RegexpMatcher attr_reader :account_id attr_reader :regex def initialize(account_id) @account_id = account_id - regex_text = Rails.cache.fetch("keyword_mutes:regex:#{account_id}") { regex_text_for_account } + regex_text = Rails.cache.fetch(self.class.cache_key(account_id)) { make_regex_text } @regex = /#{regex_text}/ end - def =~(str) - regex =~ str - end - - private + protected def keywords - Glitch::KeywordMute.where(account_id: account_id).select(:keyword, :id, :whole_word) - end - - def regex_text_for_account - kws = keywords.find_each.with_object([]) do |kw, a| - a << (kw.whole_word ? boundary_regex_for_keyword(kw.keyword) : kw.keyword) - end - - Regexp.union(kws).source + Glitch::KeywordMute.where(account_id: account_id).pluck(:whole_word, :keyword) end def boundary_regex_for_keyword(keyword) @@ -63,4 +56,45 @@ class Glitch::KeywordMute < ApplicationRecord /(?mix:#{sb}#{Regexp.escape(keyword)}#{eb})/ end end + + class TextMatcher < RegexpMatcher + def self.cache_key(account_id) + format('keyword_mutes:regex:%s', account_id) + end + + def =~(str) + regex =~ str + end + + private + + def make_regex_text + kws = keywords.map! do |whole_word, keyword| + whole_word ? boundary_regex_for_keyword(keyword) : keyword + end + + Regexp.union(kws).source + end + end + + class TagMatcher < RegexpMatcher + def self.cache_key(account_id) + format('keyword_mutes:tag:%s', account_id) + end + + def =~(tags) + tags.pluck(:name).detect { |n| regex =~ n } + end + + private + + def make_regex_text + kws = keywords.map! do |whole_word, keyword| + term = (Tag::HASHTAG_RE =~ keyword) ? $1 : keyword + whole_word ? boundary_regex_for_keyword(term) : term + end + + Regexp.union(kws).source + end + end end diff --git a/spec/models/glitch/keyword_mute_spec.rb b/spec/models/glitch/keyword_mute_spec.rb index 9685c6493..e14af0e6a 100644 --- a/spec/models/glitch/keyword_mute_spec.rb +++ b/spec/models/glitch/keyword_mute_spec.rb @@ -4,8 +4,8 @@ RSpec.describe Glitch::KeywordMute, type: :model do let(:alice) { Fabricate(:account, username: 'alice').tap(&:save!) } let(:bob) { Fabricate(:account, username: 'bob').tap(&:save!) } - describe '.matcher_for' do - let(:matcher) { Glitch::KeywordMute.matcher_for(alice) } + describe '.text_matcher_for' do + let(:matcher) { Glitch::KeywordMute.text_matcher_for(alice.id) } describe 'with no mutes' do before do From 8fc54890e50de024683d9b763b8ed8f82d46433a Mon Sep 17 00:00:00 2001 From: David Yip Date: Wed, 15 Nov 2017 18:08:03 -0600 Subject: [PATCH 3/5] Prefix cache keys with the matcher type. #208. We already know about one regex limitation, which is that they cannot segment words in e.g. Japanese, Chinese, or Thai. It may also end up that regex matching is too slow compared to other methods. However, the regex is an implementation detail. We still want the ability to switch between "occurs anywhere" and "match whole word", and caching the matcher result is likely to still be important (since the matcher itself won't change nearly as often as status ingress rate). Therefore, we ought to be able to change the cache keys to reflect a change of data structure. (Old cache keys expire within minutes, so they shouldn't be too big of an issue. Old cache keys could also be explicitly removed by an instance administrator.) --- app/models/glitch/keyword_mute.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/models/glitch/keyword_mute.rb b/app/models/glitch/keyword_mute.rb index 733dd0bc8..001be9201 100644 --- a/app/models/glitch/keyword_mute.rb +++ b/app/models/glitch/keyword_mute.rb @@ -59,7 +59,7 @@ class Glitch::KeywordMute < ApplicationRecord class TextMatcher < RegexpMatcher def self.cache_key(account_id) - format('keyword_mutes:regex:%s', account_id) + format('keyword_mutes:regex:text:%s', account_id) end def =~(str) @@ -79,7 +79,7 @@ class Glitch::KeywordMute < ApplicationRecord class TagMatcher < RegexpMatcher def self.cache_key(account_id) - format('keyword_mutes:tag:%s', account_id) + format('keyword_mutes:regex:tag:%s', account_id) end def =~(tags) From 08652baab018e65cfc4e1fbcfc909ff01f96ea60 Mon Sep 17 00:00:00 2001 From: David Yip Date: Wed, 15 Nov 2017 18:26:21 -0600 Subject: [PATCH 4/5] Replace =~ with #matches?. #208. =~ made sense when we were passing it through to a regex, but we're no longer doing that: TagMatcher looks at individual tags and returns a value that *looks* like what you get out of #=~ but really isn't that meaningful. Probably a good idea to not subvert convention like this and instead use a name with guessable intent. --- app/lib/feed_manager.rb | 14 ++++++------- app/models/glitch/keyword_mute.rb | 8 +++---- spec/models/glitch/keyword_mute_spec.rb | 28 ++++++++++++------------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/app/lib/feed_manager.rb b/app/lib/feed_manager.rb index e12a5c016..a99606e1b 100644 --- a/app/lib/feed_manager.rb +++ b/app/lib/feed_manager.rb @@ -174,19 +174,19 @@ class FeedManager text_matcher = Glitch::KeywordMute.text_matcher_for(receiver_id) tag_matcher = Glitch::KeywordMute.tag_matcher_for(receiver_id) - should_filter = text_matcher =~ status.text - should_filter ||= text_matcher =~ status.spoiler_text - should_filter ||= tag_matcher =~ status.tags + should_filter = text_matcher.matches?(status.text) + should_filter ||= text_matcher.matches?(status.spoiler_text) + should_filter ||= tag_matcher.matches?(status.tags) if status.reblog? reblog = status.reblog - should_filter ||= text_matcher =~ reblog.text - should_filter ||= text_matcher =~ reblog.spoiler_text - should_filter ||= tag_matcher =~ status.tags + should_filter ||= text_matcher.matches?(reblog.text) + should_filter ||= text_matcher.matches?(reblog.spoiler_text) + should_filter ||= tag_matcher.matches?(status.tags) end - !!should_filter + should_filter end def filter_from_mentions?(status, receiver_id) diff --git a/app/models/glitch/keyword_mute.rb b/app/models/glitch/keyword_mute.rb index 001be9201..a2481308f 100644 --- a/app/models/glitch/keyword_mute.rb +++ b/app/models/glitch/keyword_mute.rb @@ -62,8 +62,8 @@ class Glitch::KeywordMute < ApplicationRecord format('keyword_mutes:regex:text:%s', account_id) end - def =~(str) - regex =~ str + def matches?(str) + !!(regex =~ str) end private @@ -82,8 +82,8 @@ class Glitch::KeywordMute < ApplicationRecord format('keyword_mutes:regex:tag:%s', account_id) end - def =~(tags) - tags.pluck(:name).detect { |n| regex =~ n } + def matches?(tags) + tags.pluck(:name).any? { |n| regex =~ n } end private diff --git a/spec/models/glitch/keyword_mute_spec.rb b/spec/models/glitch/keyword_mute_spec.rb index e14af0e6a..bab276c26 100644 --- a/spec/models/glitch/keyword_mute_spec.rb +++ b/spec/models/glitch/keyword_mute_spec.rb @@ -13,7 +13,7 @@ RSpec.describe Glitch::KeywordMute, type: :model do end it 'does not match' do - expect(matcher =~ 'This is a hot take').to be_falsy + expect(matcher.matches?('This is a hot take')).to be_falsy end end @@ -21,75 +21,75 @@ RSpec.describe Glitch::KeywordMute, type: :model do it 'does not match keywords set by a different account' do Glitch::KeywordMute.create!(account: bob, keyword: 'take') - expect(matcher =~ 'This is a hot take').to be_falsy + expect(matcher.matches?('This is a hot take')).to be_falsy end it 'does not match if no keywords match the status text' do Glitch::KeywordMute.create!(account: alice, keyword: 'cold') - expect(matcher =~ 'This is a hot take').to be_falsy + expect(matcher.matches?('This is a hot take')).to be_falsy end it 'considers word boundaries when matching' do Glitch::KeywordMute.create!(account: alice, keyword: 'bob', whole_word: true) - expect(matcher =~ 'bobcats').to be_falsy + expect(matcher.matches?('bobcats')).to be_falsy end it 'matches substrings if whole_word is false' do Glitch::KeywordMute.create!(account: alice, keyword: 'take', whole_word: false) - expect(matcher =~ 'This is a shiitake mushroom').to be_truthy + expect(matcher.matches?('This is a shiitake mushroom')).to be_truthy end it 'matches keywords at the beginning of the text' do Glitch::KeywordMute.create!(account: alice, keyword: 'take') - expect(matcher =~ 'Take this').to be_truthy + expect(matcher.matches?('Take this')).to be_truthy end it 'matches keywords at the end of the text' do Glitch::KeywordMute.create!(account: alice, keyword: 'take') - expect(matcher =~ 'This is a hot take').to be_truthy + expect(matcher.matches?('This is a hot take')).to be_truthy end it 'matches if at least one keyword case-insensitively matches the text' do Glitch::KeywordMute.create!(account: alice, keyword: 'hot') - expect(matcher =~ 'This is a HOT take').to be_truthy + expect(matcher.matches?('This is a HOT take')).to be_truthy end it 'maintains case-insensitivity when combining keywords into a single matcher' do Glitch::KeywordMute.create!(account: alice, keyword: 'hot') Glitch::KeywordMute.create!(account: alice, keyword: 'cold') - expect(matcher =~ 'This is a HOT take').to be_truthy + expect(matcher.matches?('This is a HOT take')).to be_truthy end it 'matches keywords surrounded by non-alphanumeric ornamentation' do Glitch::KeywordMute.create!(account: alice, keyword: 'hot') - expect(matcher =~ '(hot take)').to be_truthy + expect(matcher.matches?('(hot take)')).to be_truthy end it 'escapes metacharacters in keywords' do Glitch::KeywordMute.create!(account: alice, keyword: '(hot take)') - expect(matcher =~ '(hot take)').to be_truthy + expect(matcher.matches?('(hot take)')).to be_truthy end it 'uses case-folding rules appropriate for more than just English' do Glitch::KeywordMute.create!(account: alice, keyword: 'großeltern') - expect(matcher =~ 'besuch der grosseltern').to be_truthy + expect(matcher.matches?('besuch der grosseltern')).to be_truthy end it 'matches keywords that are composed of multiple words' do Glitch::KeywordMute.create!(account: alice, keyword: 'a shiitake') - expect(matcher =~ 'This is a shiitake').to be_truthy - expect(matcher =~ 'This is shiitake').to_not be_truthy + expect(matcher.matches?('This is a shiitake')).to be_truthy + expect(matcher.matches?('This is shiitake')).to_not be_truthy end end end From c2a92dffc920cda6985dce0a0f77ae25b85659aa Mon Sep 17 00:00:00 2001 From: David Yip Date: Wed, 15 Nov 2017 23:31:49 -0600 Subject: [PATCH 5/5] Add some examples for Glitch::KeywordMute::TagMatcher. #208. --- spec/models/glitch/keyword_mute_spec.rb | 61 +++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/spec/models/glitch/keyword_mute_spec.rb b/spec/models/glitch/keyword_mute_spec.rb index bab276c26..0ffc7b18f 100644 --- a/spec/models/glitch/keyword_mute_spec.rb +++ b/spec/models/glitch/keyword_mute_spec.rb @@ -93,4 +93,65 @@ RSpec.describe Glitch::KeywordMute, type: :model do end end end + + describe '.tag_matcher_for' do + let(:matcher) { Glitch::KeywordMute.tag_matcher_for(alice.id) } + let(:status) { Fabricate(:status) } + + describe 'with no mutes' do + before do + Glitch::KeywordMute.delete_all + end + + it 'does not match' do + status.tags << Fabricate(:tag, name: 'xyzzy') + + expect(matcher.matches?(status.tags)).to be false + end + end + + describe 'with mutes' do + it 'does not match keywords set by a different account' do + status.tags << Fabricate(:tag, name: 'xyzzy') + Glitch::KeywordMute.create!(account: bob, keyword: 'take') + + expect(matcher.matches?(status.tags)).to be false + end + + it 'matches #xyzzy when given the mute "#xyzzy"' do + status.tags << Fabricate(:tag, name: 'xyzzy') + Glitch::KeywordMute.create!(account: alice, keyword: '#xyzzy') + + expect(matcher.matches?(status.tags)).to be true + end + + it 'matches #thingiverse when given the non-whole-word mute "#thing"' do + status.tags << Fabricate(:tag, name: 'thingiverse') + Glitch::KeywordMute.create!(account: alice, keyword: '#thing', whole_word: false) + + expect(matcher.matches?(status.tags)).to be true + end + + it 'matches #hashtag when given the mute "##hashtag""' do + status.tags << Fabricate(:tag, name: 'hashtag') + Glitch::KeywordMute.create!(account: alice, keyword: '##hashtag') + + expect(matcher.matches?(status.tags)).to be true + end + + it 'matches #oatmeal when given the non-whole-word mute "oat"' do + status.tags << Fabricate(:tag, name: 'oatmeal') + Glitch::KeywordMute.create!(account: alice, keyword: 'oat', whole_word: false) + + expect(matcher.matches?(status.tags)).to be true + end + + it 'does not match #oatmeal when given the mute "#oat"' do + status.tags << Fabricate(:tag, name: 'oatmeal') + Glitch::KeywordMute.create!(account: alice, keyword: 'oat') + + expect(matcher.matches?(status.tags)).to be false + end + end + end end