Switched to a normalized text column for searches. Admins running an FTS-enabled version of Monsterfork need to apply the migration in `dist/search.sql` and then run `bundle exec rails monsterfork:index_statuses`.

master
multiple creatures 2019-11-16 21:01:07 -06:00
parent 487c945d16
commit 1132af1515
11 changed files with 87 additions and 11 deletions

View File

@ -153,3 +153,5 @@ gem 'concurrent-ruby', require: false
gem "ruby-bbcode", "~> 2.0"
gem "sun_calc", "~> 0.1.0"
gem "sixarm_ruby_unaccent", "~> 1.2"

View File

@ -579,6 +579,7 @@ GEM
json (>= 1.8, < 3)
simplecov-html (~> 0.10.0)
simplecov-html (0.10.2)
sixarm_ruby_unaccent (1.2.0)
sprockets (3.7.2)
concurrent-ruby (~> 1.0)
rack (> 1, < 3)
@ -763,6 +764,7 @@ DEPENDENCIES
simple-navigation (~> 4.0)
simple_form (~> 4.1)
simplecov (~> 0.16)
sixarm_ruby_unaccent (~> 1.2)
sprockets-rails (~> 3.2)
stackprof
stoplight (~> 2.1.3)

View File

@ -21,7 +21,7 @@ module FilterHelper
return false if filters.empty?
status = status.reblog if status.reblog?
status_text = Formatter.instance.plaintext(status)
status_text = status.normalized_text
spoiler_text = status.spoiler_text
tags = status.tags.pluck(:name).join("\n")
descs = status.media_attachments.map { |a| a.description }.join("\n").strip

View File

@ -1,6 +1,7 @@
module SearchHelper
require 'sixarm_ruby_unaccent'
module SearchHelper
# Turns a user-supplied search string into the pattern that is matched
# against the `normalized_text` column.
#
# The query is lower-cased and stripped of accents, then every double-quoted
# phrase is rewritten as \yphrase\y (presumably the PostgreSQL word-boundary
# escape — verify against the regex operator used by the callers).
#
# @param query [String] raw search text
# @return [String] the transformed query
def expand_search_query(query)
  # Non-greedy capture so each quoted phrase is wrapped on its own; a greedy
  # match would swallow everything between the first and the last quote.
  query.downcase.unaccent.gsub(/"(.*?)"/, '\\y\1\\y')
end
end

View File

@ -0,0 +1,31 @@
# coding: utf-8
require 'htmlentities'
require 'sixarm_ruby_unaccent'
# Helpers that reduce status content to a plain, lower-case, accent-free
# string suitable for storing in a search column.
module TextHelper
  # Flattens an HTML fragment (or plain text) into normalized plain text.
  #
  # @param html [String] the markup to normalize
  # @return [String] lower-cased, tag-free, accent-free text with collapsed
  #   whitespace and no leading/trailing whitespace
  def normalize_text(html)
    # Opening block-level tags and <br>/<hr> become line breaks; every other
    # tag is dropped. Matching is case-insensitive because lower-casing now
    # happens after entity decoding (see below).
    t = html.gsub(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/i, "\n")
    t.gsub!(/<[bh]r[\/ ]*>/i, "\n")
    t.gsub!(/<\/?[^>]*>/, '')

    # Decode entities BEFORE lower-casing: characters produced by numeric
    # entities (e.g. &#69;) must be lower-cased too, and case-sensitive entity
    # names (e.g. &Dagger; vs &dagger;) must not be altered before lookup.
    t = HTMLEntities.new.decode(t).downcase

    # \302\240 is the UTF-8 byte sequence of U+00A0 (no-break space).
    t.gsub!(/[ \t]*\302\240+[ \t]*/, ' ')
    t.gsub!(/ +/, ' ')
    t.gsub!(/\r\n?/, "\n")
    # Trim horizontal whitespace around line breaks, then squeeze blank lines.
    t.gsub!(/\n[ \t]+/, "\n")
    t.gsub!(/[ \t]+\n/, "\n")
    t.gsub!(/\n\n+/, "\n")

    t.unaccent_via_split_map.strip
  end

  # Builds the normalized text for a status from its spoiler text and body.
  #
  # Non-local statuses use their stored `text` as-is; local statuses are first
  # run through `Formatter.instance.format`.
  #
  # @param status [Status] the status to normalize
  # @return [String] the result of #normalize_text on "spoiler\nbody"
  def normalize_status(status)
    body = status.local? ? Formatter.instance.format(status) : status.text
    normalize_text("#{status.spoiler_text}\n#{body}")
  end
end

View File

@ -720,7 +720,7 @@ class Bangtags
q = cmd[1..-1].join.strip
next if q.blank?
begin
data = @account.statuses.where('text ~* ?', expand_search_query(q))
data = @account.statuses.where('normalized_text ~ ?', expand_search_query(q))
.reorder(:created_at)
.pluck(:created_at)
.map { |d| d.strftime('%Y-%m') }

View File

@ -31,9 +31,9 @@
# edited :boolean
# imported :boolean
# origin :string
# tsv :tsvector
# boostable :boolean
# reject_replies :boolean
# normalized_text :text default(""), not null
#
class Status < ApplicationRecord
@ -43,6 +43,7 @@ class Status < ApplicationRecord
include Streamable
include Cacheable
include StatusThreadingConcern
include TextHelper
# match both with and without U+FE0F (the emoji variation selector)
LOCAL_ONLY_TOKENS = /(?:#!|\u{1f441}\ufe0f?)\u200b?\z/
@ -324,6 +325,7 @@ class Status < ApplicationRecord
around_create Mastodon::Snowflake::Callbacks
before_create :set_locality
before_create :update_normalized_text
before_validation :prepare_contents, if: :local?
before_validation :set_reblog
@ -334,6 +336,9 @@ class Status < ApplicationRecord
after_create :set_poll_id
after_create :process_bangtags, if: :local?
after_create :update_normalized_text
after_update :update_normalized_text
class << self
include SearchHelper
@ -350,7 +355,7 @@ class Status < ApplicationRecord
end
return none if term.blank? || term.length < 3
query = query.without_reblogs
.where('text ~* ?', expand_search_query(term))
.where('normalized_text ~ ?', expand_search_query(term))
.offset(offset).limit(limit)
apply_timeline_filters(query, account, true)
rescue ActiveRecord::StatementInvalid
@ -618,6 +623,12 @@ class Status < ApplicationRecord
Bangtags.new(self).process
end
# Recomputes `normalized_text` from the status contents.
#
# Does nothing unless the normalized text is blank while the text is not, or
# the text was changed by the last save. Otherwise it drops the
# "formatted_status:<id>" cache entry and assigns the freshly normalized text.
#
# NOTE(review): this method only assigns the attribute. It is also registered
# as an after_create / after_update callback, and nothing here writes the
# value to the database in that case — confirm it is persisted elsewhere.
def update_normalized_text
  return unless (normalized_text.blank? && !text.blank?) || saved_change_to_text?

  # The record itself is the status; there is no `status` local in this
  # method, so the cache key must be built from our own id.
  Rails.cache.delete("formatted_status:#{id}")
  self.normalized_text = normalize_status(self)
end
def set_conversation
self.thread = thread.reblog if thread&.reblog?

View File

@ -0,0 +1,5 @@
# Adds the `normalized_text` column to `statuses`: a non-null text column
# whose default is the empty string.
class AddNormalizedTextToStatuses < ActiveRecord::Migration[5.2]
  def change
    add_column(:statuses, :normalized_text, :text, default: '', null: false)
  end
end

View File

@ -10,10 +10,12 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2019_10_27_182731) do
ActiveRecord::Schema.define(version: 2019_11_16_233416) do
# These are extensions that must be enabled in order to support this database
enable_extension "pg_trgm"
enable_extension "plpgsql"
enable_extension "unaccent"
create_table "account_conversations", force: :cascade do |t|
t.bigint "account_id"
@ -697,9 +699,9 @@ ActiveRecord::Schema.define(version: 2019_10_27_182731) do
t.boolean "edited"
t.boolean "imported"
t.string "origin"
t.tsvector "tsv"
t.boolean "boostable"
t.boolean "reject_replies"
t.text "normalized_text", default: "", null: false
t.index ["account_id", "id", "visibility", "updated_at"], name: "index_statuses_20180106", order: { id: :desc }
t.index ["account_id", "id", "visibility"], name: "index_statuses_on_account_id_and_id_and_visibility", order: { id: :desc }, where: "(visibility = ANY (ARRAY[0, 1, 2, 4]))"
t.index ["in_reply_to_account_id"], name: "index_statuses_on_in_reply_to_account_id"
@ -707,7 +709,8 @@ ActiveRecord::Schema.define(version: 2019_10_27_182731) do
t.index ["network"], name: "index_statuses_on_network", where: "network"
t.index ["origin"], name: "index_statuses_on_origin", unique: true
t.index ["reblog_of_id", "account_id"], name: "index_statuses_on_reblog_of_id_and_account_id"
t.index ["tsv"], name: "tsv_idx", using: :gin
t.index ["spoiler_text"], name: "index_statuses_on_spoiler_text_trgm", opclass: :gin_trgm_ops, using: :gin
t.index ["text"], name: "index_statuses_on_text_trgm", opclass: :gin_trgm_ops, using: :gin
t.index ["uri"], name: "index_statuses_on_uri", unique: true
end

8
dist/search.sql vendored
View File

@ -12,7 +12,11 @@ DROP TRIGGER IF EXISTS tsvectorupdate ON statuses;
DROP FUNCTION IF EXISTS tsv_update_trigger;
DROP INDEX IF EXISTS tsv_idx;
ALTER TABLE statuses DROP COLUMN IF EXISTS tsv;
DROP INDEX IF EXISTS index_statuses_on_text_trgm;
DROP INDEX IF EXISTS index_statuses_on_spoiler_text_trgm;
-- Create new trigram indexes --
CREATE INDEX CONCURRENTLY IF NOT EXISTS index_statuses_on_text_trgm ON statuses USING GIN (text gin_trgm_ops);
CREATE INDEX CONCURRENTLY IF NOT EXISTS index_statuses_on_spoiler_text_trgm ON statuses USING GIN (spoiler_text gin_trgm_ops);
CREATE INDEX CONCURRENTLY IF NOT EXISTS index_statuses_on_normalized_text_trgm ON statuses USING GIN (normalized_text gin_trgm_ops);
-- Compact tables ---
VACUUM ANALYZE;

View File

@ -0,0 +1,17 @@
namespace :monsterfork do
  desc '(Re-)Index statuses for search.'
  # Walks every status in batches and rewrites its `normalized_text` column
  # with the output of TextHelper#normalize_status.
  task index_statuses: :environment do
    include TextHelper

    indexed = 0
    total = Status.count

    Status.find_in_batches do |statuses|
      # Report the range covered by this batch; the upper bound is the running
      # offset plus the batch size, not the batch size alone.
      ActiveRecord::Base.logger.info("Indexing statuses #{indexed + 1}-#{indexed + statuses.size} of #{total}.")
      indexed += statuses.size

      statuses.each do |s|
        # The logger is silenced around each write so the per-row updates do
        # not flood the log.
        ActiveRecord::Base.logger.silence { s.update_column(:normalized_text, normalize_status(s)) }
      end
    end
  end
end