diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index f76b9037e8..7224087368 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -11,8 +11,8 @@ class SearchIndexer @disabled = false end - def self.scrub_html_for_search(html) - HtmlScrubber.scrub(html) + def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents) + HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics) end def self.inject_extra_terms(raw) @@ -169,18 +169,10 @@ class SearchIndexer DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/ - def self.strip_diacritics(str) - s = str.unicode_normalize(:nfkd) - s.gsub!(DIACRITICS, "") - s.strip! - s - end - attr_reader :scrubbed def initialize(strip_diacritics: false) @scrubbed = +"" - # for now we are disabling this per: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam @strip_diacritics = strip_diacritics end @@ -189,7 +181,7 @@ class SearchIndexer me = new(strip_diacritics: strip_diacritics) Nokogiri::HTML::SAX::Parser.new(me).parse("
#{html}
") - me.scrubbed + me.scrubbed.squish end ATTRIBUTES ||= %w{alt title href data-youtube-title} @@ -204,8 +196,15 @@ class SearchIndexer end end + def strip_diacritics(str) + s = str.unicode_normalize(:nfkd) + s.gsub!(DIACRITICS, "") + s.strip! + s + end + def characters(str) - str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics + str = strip_diacritics(str) if @strip_diacritics scrubbed << " #{str} " end end diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 2a26a8c303..cf2f6ef211 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -1144,6 +1144,7 @@ en: log_search_queries: "Log search queries performed by users" search_query_log_max_size: "Maximum amount of search queries to keep" search_query_log_max_retention_days: "Maximum amount of time to keep search queries, in days." + search_ignore_accents: "Ignore accents when searching for text." allow_uncategorized_topics: "Allow topics to be created without a category. WARNING: If there are any uncategorized topics, you must recategorize them before turning this off." allow_duplicate_topic_titles: "Allow topics with identical, duplicate titles." unique_posts_mins: "How many minutes before a user can make a post with the same content again" diff --git a/config/site_settings.yml b/config/site_settings.yml index 91d17c9730..e37054a60b 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -1431,7 +1431,6 @@ search: zh_TW: 2 ko: 2 ja: 2 - search_tokenize_chinese_japanese_korean: false search_prefer_recent_posts: false search_recent_posts_size: @@ -1446,6 +1445,22 @@ search: search_query_log_max_retention_days: default: 365 # 1 year max: 1825 # 5 years + search_ignore_accents: + default: false + locale_default: + ar: true + ca: true + cs: true + el: true + es: true + fa_IR: true + fr: true + hu: true + pt: true + pt_BR: true + ro: true + sk: true + tr_TR: true uncategorized: version_checks: diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb index 04ad93fa13..1f81e43e2c 100644 --- a/lib/search/grouped_search_results.rb +++ b/lib/search/grouped_search_results.rb @@ -63,13 +63,14 @@ class Search end def self.blurb_for(cooked, term = nil, blurb_length = 200) - cooked = SearchIndexer::HtmlScrubber.scrub(cooked).squish - blurb = nil + cooked = SearchIndexer.scrub_html_for_search(cooked) + if term terms = term.split(/\s+/) blurb = TextHelper.excerpt(cooked, terms.first, radius: blurb_length / 2, seperator: " ") end + blurb = TextHelper.truncate(cooked, length: blurb_length, seperator: " ") if blurb.blank? Sanitize.clean(blurb) end diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index a9f309a489..3886eb025a 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -3,6 +3,10 @@ require 'rails_helper' describe SearchIndexer do let(:post_id) { 99 } + def scrub(html, strip_diacritics: false) + SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics) + end + it 'correctly indexes chinese' do SiteSetting.default_locale = 'zh_CN' data = "你好世界" @@ -16,26 +20,26 @@ describe SearchIndexer do it 'extract youtube title' do html = "
" - - scrubbed = SearchIndexer::HtmlScrubber.scrub(html) - - expect(scrubbed).to eq(" Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive] ") + scrubbed = scrub(html) + expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]") end it 'extract a link' do html = "link" - - scrubbed = SearchIndexer::HtmlScrubber.scrub(html) - - expect(scrubbed).to eq(" http://meta.discourse.org/ link ") + scrubbed = scrub(html) + expect(scrubbed).to eq("http://meta.discourse.org/ link") end - it 'removes diacritics' do + it 'uses ignore_accent setting to strip diacritics' do html = "

HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好

" - scrubbed = SearchIndexer::HtmlScrubber.scrub(html, strip_diacritics: true) + SiteSetting.search_ignore_accents = true + scrubbed = SearchIndexer.scrub_html_for_search(html) + expect(scrubbed).to eq("HELLO Heterogeneite Здравствуите هتاف للترحيب 你好") - expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ") + SiteSetting.search_ignore_accents = false + scrubbed = SearchIndexer.scrub_html_for_search(html) + expect(scrubbed).to eq("HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好") end it "doesn't index local files" do @@ -54,9 +58,9 @@ describe SearchIndexer do HTML - scrubbed = SearchIndexer::HtmlScrubber.scrub(html).gsub(/\s+/, " ") + scrubbed = scrub(html) - expect(scrubbed).to eq(" Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB ") + expect(scrubbed).to eq("Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB") end it 'correctly indexes a post according to version' do