switch (back) to postgres fts engine for fast search & timeline filters

master
multiple creatures 2020-01-10 03:45:29 -06:00
parent 4c8591fbea
commit f03960382b
13 changed files with 5455 additions and 83 deletions

View File

@ -7,7 +7,7 @@ module FilterHelper
status = status.reblog if status.reblog?
if Status.where(id: status.id).regex_filtered_by_account(receiver_id).exists?
if Status.where(id: status.id).search_filtered_by_account(receiver_id).exists?
redis.sadd("filtered_statuses:#{receiver_id}", status.id)
return true
end

View File

@ -1,20 +0,0 @@
require 'sixarm_ruby_unaccent'
# Turns a user-supplied search phrase into the pattern string handed to the
# regex-based status search.
module SearchHelper
  # Normalizes +query+ and expands the optional "prefix:" shorthand.
  #
  # The input is lower-cased, unaccented, stripped of everything except word
  # characters, spaces and punctuation, and has runs of spaces collapsed.
  # When the result contains a colon, the part before the first colon selects
  # an expansion:
  #   tag / tags         -> "^tag (a|b|c)" built from the whitespace-separated terms
  #   subj / text / desc -> "^<prefix> .*<rest>"
  # Any other prefix leaves the query untouched. Finally a double-quoted span
  # is wrapped in \y markers in place of the quotes.
  #
  # Returns '' when the query is blank before or after normalization.
  def expand_search_query(query)
    return '' if query.blank?

    cleaned = query.downcase.unaccent
    cleaned = cleaned.gsub(/[^\p{Word} [:punct:]]/, '')
    cleaned = cleaned.gsub(/ +/, ' ').strip
    return '' if cleaned.blank?

    if cleaned.include?(':')
      # A limit of 2 keeps any further colons inside the remainder.
      prefix, remainder = cleaned.split(':', 2)

      case prefix
      when 'tag', 'tags'
        cleaned = "^tag (#{remainder.split.join('|')})"
      when 'subj', 'text', 'desc'
        cleaned = "^#{prefix} .*#{remainder}"
      end
    end

    cleaned.gsub(/"(.*)"/, '\\y\1\\y')
  end
end

View File

@ -3,7 +3,6 @@
class Bangtags
include ModerationHelper
include ServiceAccountHelper
include SearchHelper
attr_reader :status, :account
@ -764,7 +763,7 @@ class Bangtags
q = cmd[1..-1].join.strip
next if q.blank?
begin
data = @account.statuses.regex(expand_search_query(q))
data = @account.statuses.search(q.unaccent)
.reorder(:created_at)
.pluck(:created_at)
.map { |d| d.strftime('%Y-%m') }

View File

@ -1,13 +0,0 @@
# == Schema Information
#
# Table name: normalized_statuses
#
# id :bigint(8) not null, primary key
# status_id :bigint(8)
# text :text
#
# Row in +normalized_statuses+ holding the text stored alongside a single status.
class NormalizedStatus < ApplicationRecord
  belongs_to :status, inverse_of: :normalized_status

  # At most one normalized row may reference a given status.
  validates :status_id, uniqueness: true
end

View File

@ -30,6 +30,7 @@
# edited :boolean
# boostable :boolean
# reject_replies :boolean
# tsv :tsvector
#
class Status < ApplicationRecord
@ -82,7 +83,6 @@ class Status < ApplicationRecord
has_one :status_stat, inverse_of: :status
has_one :poll, inverse_of: :status, dependent: :destroy
has_one :destructing_status, inverse_of: :status, dependent: :destroy
has_one :normalized_status, inverse_of: :status, dependent: :destroy
has_one :imported_status, inverse_of: :status, dependent: :destroy
has_one :sharekey, inverse_of: :status, dependent: :destroy
@ -118,10 +118,10 @@ class Status < ApplicationRecord
scope :mention_not_excluded_by_account, ->(account) { left_outer_joins(:mentions).where('mentions.account_id IS NULL OR mentions.account_id NOT IN (?)', account.excluded_from_timeline_account_ids) }
scope :not_domain_blocked_by_account, ->(account) { account.excluded_from_timeline_domains.blank? ? left_outer_joins(:account) : left_outer_joins(:account).where('accounts.domain IS NULL OR accounts.domain NOT IN (?)', account.excluded_from_timeline_domains) }
scope :like, ->(needle) { joins(:normalized_status).select('statuses.*').where('normalized_statuses.text LIKE f_normalize(?)', needle) }
scope :regex, ->(needle) { joins(:normalized_status).select('statuses.*').where('normalized_statuses.text ~ f_normalize(?)', needle) }
scope :regex_filtered_by_account, ->(account_id) { joins(:normalized_status).select('statuses.*').where('normalized_statuses.text ~ ANY(ARRAY(SELECT f_normalize(phrase) FROM custom_filters WHERE account_id = ?))', account_id) }
scope :regex_not_filtered_by_account, ->(account_id) { joins(:normalized_status).select('statuses.*').where('normalized_statuses.text !~ ALL(ARRAY(SELECT f_normalize(phrase) FROM custom_filters WHERE account_id = ?))', account_id) }
scope :search, ->(needle) { where("tsv @@ websearch_to_tsquery('fedi', ?)", needle) }
scope :search_not, ->(needle) { where.not("tsv @@ websearch_to_tsquery('fedi', ?)", needle) }
# Statuses whose tsv matches at least one of the account's custom filter phrases.
# The phrases are parsed with the same 'fedi' text search configuration that
# builds statuses.tsv, so both sides of @@ are normalized identically.
scope :search_filtered_by_account, ->(account_id) { where("tsv @@ (SELECT tsquery_union(websearch_to_tsquery('fedi', phrase)) FROM custom_filters WHERE account_id = ?)", account_id) }
# Statuses that match none of the account's custom filter phrases.
# IS NOT TRUE (rather than NOT) keeps rows when the subquery yields NULL, which
# happens when the account has no filters: the aggregate runs over zero rows and
# a plain NOT (NULL) would discard every status.
scope :search_not_filtered_by_account, ->(account_id) { where("(tsv @@ (SELECT tsquery_union(websearch_to_tsquery('fedi', phrase)) FROM custom_filters WHERE account_id = ?)) IS NOT TRUE", account_id) }
scope :not_missing_media_desc, -> { left_outer_joins(:media_attachments).select('statuses.*').where('media_attachments.id IS NULL OR media_attachments.description IS NOT NULL') }
@ -362,8 +362,6 @@ class Status < ApplicationRecord
after_save :process_bangtags, if: :local?
class << self
include SearchHelper
def search_for(term, account = nil, limit = 33, offset = 0)
return none if account.nil?
if term.start_with?('me:')
@ -371,12 +369,13 @@ class Status < ApplicationRecord
query = account.statuses
else
query = Status.where(account_id: account.id)
.or(Status.where(account_id: account.following, visibility: [:private, :local, :unlisted]))
.or(Status.where(visibility: [:local, :public]))
.or(Status.where(account_id: account.following, visibility: [:private, :unlisted]))
.or(Status.where(id: account.mentions.select(:status_id)))
end
return none if term.blank? || term.length < 3
return none if term.blank?
query = query.without_reblogs
.regex(expand_search_query(term))
.search(term.unaccent)
.offset(offset).limit(limit)
apply_timeline_filters(query, account, true)
rescue ActiveRecord::StatementInvalid
@ -583,9 +582,9 @@ class Status < ApplicationRecord
query = query.mention_not_excluded_by_account(account)
unless account.custom_filters.nil?
if account.user.invert_filters
query = query.regex_filtered_by_account(account.id)
query = query.search_filtered_by_account(account.id)
else
query = query.regex_not_filtered_by_account(account.id)
query = query.search_not_filtered_by_account(account.id)
end
end
query = query.not_missing_media_desc if account.filter_undescribed?

View File

@ -45,7 +45,7 @@ en:
setting_skin: Reskins the selected Mastodon flavour
setting_theme: Affects how Mastodon looks when you're logged in from any device.
username: Your username will be unique on %{domain}
phrase_html: "<code>&quot;thing&quot;</code> - match whole words<br/><code>tags: &quot;tag1&quot; tag2 ...</code> - match tags (don't include <code>#</code>)<br/><code>subj: thing</code> - match subject or CW</code><br/><code>text: thing</code> - match text<br/><code>desc: thing</code> - match media descriptions"
phrase_html: "<strong>Examples</strong><br>Containing any terms: <code>this OR that</code><br>Containing all terms: <code>this that</code>, <code>this AND that</code><br>Containing an exact term: <code>&quot;this thing&quot;</code><br>Grouping: <code>this OR (&quot;this thing&quot; AND &quot;that thing&quot;)</code>"
featured_tag:
name: 'You might want to use one of these:'
imports:
@ -113,7 +113,7 @@ en:
note: Bio
otp_attempt: Two-factor code
password: Password
phrase: Regular expression
phrase: Filter query
setting_advanced_layout: Enable advanced web interface
setting_aggregate_reblogs: Group repeats in timelines
setting_auto_play_gif: Auto-play animated GIFs

View File

@ -1,23 +0,0 @@
# Moves the normalized search text out of +statuses.normalized_text+ into a
# dedicated +normalized_statuses+ table (one row per status).
class CreateNormalizedStatuses < ActiveRecord::Migration[5.2]
  def up
    create_table :normalized_statuses do |t|
      t.references :status, null: false, foreign_key: {on_delete: :cascade}, index: {unique: true}
      t.text :text
    end

    safety_assured do
      remove_index :statuses, name: 'index_statuses_on_normalized_text_trgm'
      execute 'INSERT INTO normalized_statuses (status_id, text) SELECT id, normalized_text FROM statuses'
      remove_column :statuses, :normalized_text
    end
  end

  def down
    safety_assured do
      # The column has to exist again before anything can be copied back into it.
      add_column :statuses, :normalized_text, :text, null: false, default: ''

      # Join on status_id (normalized_statuses has no column mirroring statuses.id
      # other than that one) and coalesce, because normalized_statuses.text is
      # nullable while statuses.normalized_text is NOT NULL.
      execute "UPDATE statuses SET normalized_text = COALESCE(n.text, '') FROM normalized_statuses AS n WHERE statuses.id = n.status_id"

      # Dropping the table also drops every index defined on it, so no separate
      # remove_index is needed (and #up never created a trigram index there).
      drop_table :normalized_statuses

      # NOTE(review): #up removed 'index_statuses_on_normalized_text_trgm' from
      # statuses; it is not recreated here because its definition is not visible
      # in this migration. Confirm against the migration that introduced it.
    end
  end
end

View File

@ -0,0 +1,42 @@
# Replaces the regex/normalized-text search backend with PostgreSQL full-text
# search: drops the normalized_statuses table and the f_normalize / f_unaccent
# helpers, then adds a generated +tsv+ column on statuses built with a custom
# 'fedi' text search configuration.
class MigrateBackToFts < ActiveRecord::Migration[5.2]
  def up
    # Dropping a table removes its indexes with it, so no explicit remove_index
    # is issued; that call would raise if the named index were absent.
    drop_table :normalized_statuses if table_exists?(:normalized_statuses)

    safety_assured do
      # NOTE(review): the 'fedi' mapping references the "unaccent" dictionary,
      # which presumably comes from the unaccent extension being installed
      # already - confirm for fresh databases.
      execute <<-SQL.squish
        DROP FUNCTION IF EXISTS public.f_normalize;
        DROP FUNCTION IF EXISTS public.f_unaccent;

        CREATE OR REPLACE FUNCTION public.f_strip_mentions(text)
          RETURNS text LANGUAGE sql IMMUTABLE PARALLEL SAFE STRICT AS
          $func$
            SELECT regexp_replace(
              regexp_replace($1, '</?span>', '', 'g'),
              '>@[^[:space:]]+<', '><', 'g'
            )
          $func$;

        CREATE OR REPLACE AGGREGATE tsquery_union(tsquery) (
          SFUNC = tsquery_or,
          STYPE = tsquery,
          PARALLEL = SAFE
        );

        CREATE TEXT SEARCH CONFIGURATION fedi ( COPY = simple );

        ALTER TEXT SEARCH CONFIGURATION fedi
          ALTER MAPPING FOR hword, hword_part, word
          WITH unaccent, simple;

        ALTER TABLE statuses
          ADD COLUMN tsv tsvector
          GENERATED ALWAYS AS (
            to_tsvector('fedi', f_strip_mentions(spoiler_text || ' ' || text))
          ) STORED;
      SQL
    end
  end

  # The dropped table contents and SQL functions cannot be rebuilt from here, so
  # rollback is refused explicitly. Without a #down, rolling back would silently
  # do nothing and a later re-run of #up would fail on the already existing
  # 'fedi' configuration and tsv column.
  def down
    raise ActiveRecord::IrreversibleMigration
  end
end

View File

@ -0,0 +1,15 @@
# Adds a GIN index on statuses.tsv to back full-text (@@) queries.
#
# The index is created and dropped CONCURRENTLY, which PostgreSQL does not allow
# inside a transaction block; hence disable_ddl_transaction!.
class AddIndexToTsv < ActiveRecord::Migration[5.2]
  disable_ddl_transaction!

  def up
    safety_assured do
      execute 'CREATE INDEX CONCURRENTLY statuses_text_vector_idx ON statuses USING GIN(tsv)'
    end
  end

  def down
    safety_assured do
      # PostgreSQL's DROP INDEX takes only the index name; the
      # "DROP INDEX name ON table" form is MySQL syntax and fails to parse here.
      # IF EXISTS lets the rollback succeed even if the index was never built.
      execute 'DROP INDEX CONCURRENTLY IF EXISTS statuses_text_vector_idx'
    end
  end
end

5382
db/structure.sql Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +0,0 @@
# Test fabricator for :normalized_status records with placeholder text.
# NOTE(review): status is set to nil here; callers presumably have to supply a
# status themselves for the record to be persistable - confirm.
Fabricator(:normalized_status) do
status nil
text "MyText"
end

View File

@ -1,5 +0,0 @@
require 'rails_helper'
# Placeholder model spec for NormalizedStatus: it registers a single pending
# example and makes no assertions.
RSpec.describe NormalizedStatus, type: :model do
pending "add some examples to (or delete) #{__FILE__}"
end

View File

@ -424,7 +424,7 @@ const startWorker = (workerId) => {
}
const queries = [
client.query(`SELECT 1 FROM blocks WHERE (account_id = $1 AND target_account_id IN (${placeholders(targetAccountIds, 3)})) OR (account_id = $2 AND target_account_id = $1) UNION SELECT 1 FROM mutes WHERE account_id = $1 AND target_account_id IN (${placeholders(targetAccountIds, 3)}) UNION SELECT 1 FROM normalized_statuses WHERE status_id = $3 AND text ${req.invertFilters ? '!~' : '~'} ANY(ARRAY(SELECT f_normalize(phrase) FROM custom_filters WHERE account_id = $1)) UNION SELECT 1 FROM media_attachments WHERE (1 = (SELECT 1 FROM accounts WHERE id = $1 AND filter_undescribed)) AND status_id = $3 AND description IS NULL LIMIT 1`, [req.accountId, unpackedPayload.account.id, unpackedPayload.id].concat(targetAccountIds)),
client.query(`SELECT 1 FROM blocks WHERE (account_id = $1 AND target_account_id IN (${placeholders(targetAccountIds, 3)})) OR (account_id = $2 AND target_account_id = $1) UNION SELECT 1 FROM mutes WHERE account_id = $1 AND target_account_id IN (${placeholders(targetAccountIds, 3)}) UNION SELECT 1 FROM statuses WHERE id = $3 ${req.invertFilters ? 'AND NOT' : 'AND'} tsv @@ (SELECT tsquery_union(websearch_to_tsquery(phrase)) FROM custom_filters WHERE account_id = $1) UNION SELECT 1 FROM media_attachments WHERE (1 = (SELECT 1 FROM accounts WHERE id = $1 AND filter_undescribed)) AND status_id = $3 AND description IS NULL LIMIT 1`, [req.accountId, unpackedPayload.account.id, unpackedPayload.id].concat(targetAccountIds)),
];
if (accountDomain) {