diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index f76b9037e8..7224087368 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -11,8 +11,8 @@ class SearchIndexer
@disabled = false
end
- def self.scrub_html_for_search(html)
- HtmlScrubber.scrub(html)
+ def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents) # the default is read from the search_ignore_accents site setting whenever the caller omits the keyword
+ HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics) # forwards html and the flag to HtmlScrubber.scrub unchanged
end
def self.inject_extra_terms(raw)
@@ -169,18 +169,10 @@ class SearchIndexer
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
- def self.strip_diacritics(str)
- s = str.unicode_normalize(:nfkd)
- s.gsub!(DIACRITICS, "")
- s.strip!
- s
- end
-
attr_reader :scrubbed
def initialize(strip_diacritics: false)
@scrubbed = +""
- # for now we are disabling this per: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam
@strip_diacritics = strip_diacritics
end
@@ -189,7 +181,7 @@ class SearchIndexer
me = new(strip_diacritics: strip_diacritics)
Nokogiri::HTML::SAX::Parser.new(me).parse("
#{html}
")
- me.scrubbed
+ me.scrubbed.squish
end
ATTRIBUTES ||= %w{alt title href data-youtube-title}
@@ -204,8 +196,15 @@ class SearchIndexer
end
end
+ def strip_diacritics(str) # returns a new string; str itself is not modified
+ s = str.unicode_normalize(:nfkd) # NFKD decomposition splits accented characters into a base character plus combining marks
+ s.gsub!(DIACRITICS, "") # removes every code point matched by the DIACRITICS pattern
+ s.strip! # in-place; the return value (nil when nothing was stripped) is ignored
+ s # returned explicitly because the bang methods above can return nil
+ end
+
def characters(str)
- str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics
+ str = strip_diacritics(str) if @strip_diacritics # applied only when the scrubber was built with strip_diacritics: true
scrubbed << " #{str} "
end
end
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index 2a26a8c303..cf2f6ef211 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -1144,6 +1144,7 @@ en:
log_search_queries: "Log search queries performed by users"
search_query_log_max_size: "Maximum amount of search queries to keep"
search_query_log_max_retention_days: "Maximum amount of time to keep search queries, in days."
+ search_ignore_accents: "Ignore accents when searching for text."
allow_uncategorized_topics: "Allow topics to be created without a category. WARNING: If there are any uncategorized topics, you must recategorize them before turning this off."
allow_duplicate_topic_titles: "Allow topics with identical, duplicate titles."
unique_posts_mins: "How many minutes before a user can make a post with the same content again"
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 91d17c9730..e37054a60b 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -1431,7 +1431,6 @@ search:
zh_TW: 2
ko: 2
ja: 2
-
search_tokenize_chinese_japanese_korean: false
search_prefer_recent_posts: false
search_recent_posts_size:
@@ -1446,6 +1445,22 @@ search:
search_query_log_max_retention_days:
default: 365 # 1 year
max: 1825 # 5 years
+ search_ignore_accents: # read in code as SiteSetting.search_ignore_accents (default for SearchIndexer.scrub_html_for_search)
+ default: false
+ locale_default: # NOTE(review): presumably per-locale overrides of the default above - confirm against the site-settings loader
+ ar: true
+ ca: true
+ cs: true
+ el: true
+ es: true
+ fa_IR: true
+ fr: true
+ hu: true
+ pt: true
+ pt_BR: true
+ ro: true
+ sk: true
+ tr_TR: true
uncategorized:
version_checks:
diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb
index 04ad93fa13..1f81e43e2c 100644
--- a/lib/search/grouped_search_results.rb
+++ b/lib/search/grouped_search_results.rb
@@ -63,13 +63,14 @@ class Search
end
def self.blurb_for(cooked, term = nil, blurb_length = 200)
- cooked = SearchIndexer::HtmlScrubber.scrub(cooked).squish
-
blurb = nil
+ cooked = SearchIndexer.scrub_html_for_search(cooked) # shared entry point: uses the search_ignore_accents default; the scrubber squishes its output, so no extra .squish here
+ # NOTE(review): with search_ignore_accents on, the blurb is built from accent-stripped text - confirm that is intended for display
if term
terms = term.split(/\s+/)
blurb = TextHelper.excerpt(cooked, terms.first, radius: blurb_length / 2, seperator: " ")
end
+ # NOTE(review): the "seperator:" key passed to excerpt and truncate looks like a misspelling of "separator:" - confirm whether TextHelper honours it
blurb = TextHelper.truncate(cooked, length: blurb_length, seperator: " ") if blurb.blank?
Sanitize.clean(blurb)
end
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index a9f309a489..3886eb025a 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -3,6 +3,10 @@ require 'rails_helper'
describe SearchIndexer do
let(:post_id) { 99 }
+ def scrub(html, strip_diacritics: false) # spec helper: diacritic stripping stays off unless a test opts in
+ SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics) # keyword is always passed, so the SiteSetting default is never evaluated here
+ end
+
it 'correctly indexes chinese' do
SiteSetting.default_locale = 'zh_CN'
data = "你好世界"
@@ -16,26 +20,26 @@ describe SearchIndexer do
it 'extract youtube title' do
html = ""
-
- scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
-
- expect(scrubbed).to eq(" Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive] ")
+ scrubbed = scrub(html) # spec helper: scrub_html_for_search with strip_diacritics: false
+ expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]") # no padding spaces: the scrubber returns squished text
end
it 'extract a link' do
html = "link"
-
- scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
-
- expect(scrubbed).to eq(" http://meta.discourse.org/ link ")
+ scrubbed = scrub(html) # spec helper: scrub_html_for_search with strip_diacritics: false
+ expect(scrubbed).to eq("http://meta.discourse.org/ link") # no padding spaces: the scrubber returns squished text
end
- it 'removes diacritics' do
+ it 'uses ignore_accent setting to strip diacritics' do # exercises the SiteSetting.search_ignore_accents default in both states
html = "HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好
"
- scrubbed = SearchIndexer::HtmlScrubber.scrub(html, strip_diacritics: true)
+ SiteSetting.search_ignore_accents = true
+ scrubbed = SearchIndexer.scrub_html_for_search(html) # keyword omitted on purpose so the setting supplies the default
+ expect(scrubbed).to eq("HELLO Heterogeneite Здравствуите هتاف للترحيب 你好")
- expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ")
+ SiteSetting.search_ignore_accents = false # setting off: the same input must come back with its accents intact
+ scrubbed = SearchIndexer.scrub_html_for_search(html)
+ expect(scrubbed).to eq("HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好")
end
it "doesn't index local files" do
@@ -54,9 +58,9 @@ describe SearchIndexer do
HTML
- scrubbed = SearchIndexer::HtmlScrubber.scrub(html).gsub(/\s+/, " ")
+ scrubbed = scrub(html)
- expect(scrubbed).to eq(" Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB ")
+ expect(scrubbed).to eq("Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB")
end
it 'correctly indexes a post according to version' do