bon voyage to that shitty text normalization code
parent
f03960382b
commit
a29fb04e7c
|
@ -1,58 +0,0 @@
|
|||
# coding: utf-8
|
||||
require 'htmlentities'
|
||||
require 'sixarm_ruby_unaccent'
|
||||
|
||||
# Helpers that flatten a status into a plain, line-oriented text form.
#
# Every line of the result of {#normalize_status} carries a prefix naming
# the part of the status it came from: +tag+, +subj+ (spoiler text),
# +text+ (status body) or +desc+ (media attachment description).
module TextHelper
  # Converts an HTML fragment to plain text.
  #
  # Opening block-level tags and +<br>+/+<hr>+ become newlines, every other
  # tag is dropped, and HTML entities are decoded afterwards.
  #
  # @param html [String] HTML markup
  # @return [String] text with all tags removed and entities decoded
  def html2text(html)
    # Tag names are matched case-insensitively so that "<P>" or "<BR>" yield
    # a line break exactly like their lowercase forms.
    html = html
           .gsub(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/i, "\n")
           .gsub(/<[bh]r[\/ ]*>/i, "\n")
           .gsub(/<\/?[^>]*>/, '')

    HTMLEntities.new.decode(html)
  end

  # Reduces free text to a lowercase, unaccented, whitespace-normalized form.
  #
  # @param text [String] raw text
  # @return [String] normalized text; lines are separated by a single "\n"
  def normalize_text(text)
    text.downcase
        .gsub(Account::MENTION_RE, '')
        # Drop runs of hashtags that open or close a line.
        .gsub(/^(?:#[\w:._·\-]+\s*)+|(?:#[\w:._·\-]+\s*)+$/, '')
        # \302\240 is the UTF-8 byte sequence of U+00A0 (no-break space).
        .gsub(/\s*\302\240+\s*/, ' ')
        # Line endings are unified first so that a lone CR is treated like
        # any other line break by the trimming step below.
        .gsub(/\r\n?/, "\n")
        # Strip whitespace on BOTH sides of a line break and collapse
        # consecutive line breaks into one.
        .gsub(/[^\S\n]*\n\s*/, "\n")
        .unaccent_via_split_map
        .gsub(/(?:htt|ft)ps?:\/\//, '')
        # Keep only newlines, word characters, spaces and punctuation.
        .gsub(/[^\n\p{Word} [:punct:]]/, '')
        .gsub(/ +/, ' ')
        .strip
  end

  # Builds the complete normalized representation of a status.
  #
  # Sections appear in the order tags, spoiler, body, media descriptions.
  # Sections that are absent are omitted entirely, so the result never
  # contains empty lines between sections.
  #
  # @param status [#tags, #spoiler_text, #text, #local?, #media_attachments]
  # @return [String]
  def normalize_status(status)
    [
      _format_tags(status),
      _format_spoiler(status),
      _format_status(status),
      _format_desc(status),
    ].compact.join("\n").strip
  end

  # @return [String, nil] one "tag <name>" line per tag, or nil without tags
  def _format_tags(status)
    return unless status.tags.present?

    "tag #{status.tags.pluck(:name).join("\ntag ")}"
  end

  # @return [String, nil] "subj <normalized spoiler>", or nil when the
  #   spoiler text is blank
  def _format_spoiler(status)
    return if status.spoiler_text.blank?

    "subj #{normalize_text(status.spoiler_text)}"
  end

  # Local statuses are rendered through Formatter first; for all other
  # statuses the stored text is used directly.
  #
  # @return [String, nil] the normalized body with every line prefixed by
  #   "text ", or nil when there is no body
  def _format_status(status)
    text = status.local? ? Formatter.instance.format(status) : status.text
    return if text.blank?

    normalized = normalize_text(html2text(text))
    "text #{normalized.gsub("\n", "\ntext ")}"
  end

  # Each description is normalized on its own, so all descriptions are
  # treated alike, and every resulting line gets the "desc " prefix.
  #
  # @return [String, nil] the prefixed description lines, or nil when no
  #   attachment has a usable description
  def _format_desc(status)
    return unless status.media_attachments.present?

    normalized = status.media_attachments.pluck(:description)
                       .reject(&:blank?)
                       .map { |description| normalize_text(description) }
                       .reject(&:empty?)
    return if normalized.empty?

    "desc #{normalized.join("\n").gsub("\n", "\ndesc ")}"
  end
end
|
|
@ -40,7 +40,6 @@ class Status < ApplicationRecord
|
|||
include Streamable
|
||||
include Cacheable
|
||||
include StatusThreadingConcern
|
||||
include TextHelper
|
||||
|
||||
# match both with and without U+FE0F (the emoji variation selector)
|
||||
LOCAL_ONLY_TOKENS = /(?:#!|\u{1f441}\ufe0f?)\u200b?\z/
|
||||
|
@ -358,7 +357,6 @@ class Status < ApplicationRecord
|
|||
|
||||
after_save :update_sharekey, if: :local?
|
||||
after_save :update_origin, if: :local?
|
||||
after_save :update_normalized_text
|
||||
after_save :process_bangtags, if: :local?
|
||||
|
||||
class << self
|
||||
|
@ -680,16 +678,6 @@ class Status < ApplicationRecord
|
|||
end
|
||||
end
|
||||
|
||||
# Keeps the associated normalized_status record in sync with this status.
#
# Does nothing when the record is destroyed, when the text is blank, or
# when the text did not change. Otherwise the normalized text is computed
# with normalize_status and stored, creating the normalized_status record
# if it does not exist yet.
#
# NOTE(review): only a change to `text` triggers a refresh; whether changes
# to the spoiler text, tags or media descriptions should too is not visible
# from here -- confirm.
def update_normalized_text
  return if destroyed? || text.blank? || !(text_changed? || saved_change_to_text?)

  normalized_text = normalize_status(self)

  if normalized_status.nil?
    create_normalized_status(text: normalized_text)
  else
    # `update` replaces `update_attributes`, which is deprecated in Rails 6.0
    # and removed in Rails 6.1.
    normalized_status.update(text: normalized_text)
  end
end
|
||||
|
||||
def set_conversation
|
||||
self.thread = thread.reblog if thread&.reblog?
|
||||
|
||||
|
|
|
@ -1,48 +1,6 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
# Writes the normalized text of every status yielded by +statuses_query+ to
# its normalized_status record.
#
# Statuses are read in batches. For each batch a progress line is logged,
# then logging is silenced while the batch is processed. A normalized_status
# record is created when missing and rewritten (via update_column) only when
# the stored text differs from the freshly computed one.
#
# @param statuses_query [#count, #find_in_batches] the statuses to index
def index_statuses(statuses_query)
  # The helper methods are mixed into a private object instead of calling
  # `include TextHelper` here, which would add them to Object globally.
  normalizer = Object.new.extend(TextHelper)

  processed = 0
  total = statuses_query.count

  statuses_query.find_in_batches do |batch|
    ActiveRecord::Base.logger.info("Indexing status #{processed + 1} of #{total}.")
    ActiveRecord::Base.logger.silence do
      processed += batch.size

      batch.each do |status|
        begin
          next if status.destroyed?

          normalized_text = normalizer.normalize_status(status)
          existing = status.normalized_status

          if existing.nil?
            status.create_normalized_status(text: normalized_text)
          elsif existing.text != normalized_text
            existing.update_column(:text, normalized_text)
          end
        rescue ActiveRecord::RecordNotFound
          # Deliberate best effort: a record that can no longer be found is
          # skipped and indexing continues with the next status.
          next
        end
      end
    end
  end
end
|
||||
|
||||
namespace :monsterfork do
|
||||
desc 'Index statuses for search that have not been indexed yet.'
|
||||
task index_statuses: :environment do
|
||||
index_statuses(Status.where(normalized_text: ''))
|
||||
end
|
||||
|
||||
desc 'Reindex all statuses for search.'
|
||||
task reindex_statuses: :environment do
|
||||
index_statuses(Status)
|
||||
end
|
||||
|
||||
desc 'Reindex statuses containing media with descriptions for search.'
|
||||
task reindex_media_descs: :environment do
|
||||
index_statuses(Status.left_outer_joins(:media_attachments).where('media_attachments.description IS NOT NULL'))
|
||||
end
|
||||
|
||||
desc "Re-apply all users' filters to their home and list timelines."
|
||||
task reapply_filters: :environment do
|
||||
Account.local.find_each do |account|
|
||||
|
|
Loading…
Reference in New Issue