From 5b342ae50575f2ffc4029f5e612d2e8a3375ae00 Mon Sep 17 00:00:00 2001 From: Sam Date: Mon, 12 Apr 2021 12:46:42 +1000 Subject: [PATCH] FIX: remove superfluous spaces from CJK blurbs (#12629) Previously we generated blurbs from the raw indexed data even when the text was Chinese, Japanese, or Korean. Because CJK text is segmented when it is indexed, this caused superfluous spaces to show up in excerpts. --- lib/search.rb | 7 ++++++- lib/search/grouped_search_results.rb | 2 +- spec/components/search_spec.rb | 21 +++++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/lib/search.rb b/lib/search.rb index 2370877977..aa7b1cafbc 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -64,6 +64,11 @@ class Search end end + def self.segment_cjk? + ['zh_TW', 'zh_CN', 'ja'].include?(SiteSetting.default_locale) || + SiteSetting.search_tokenize_chinese_japanese_korean + end + def self.prepare_data(search_data, purpose = :query) purpose ||= :query @@ -73,7 +78,7 @@ class Search # TODO cppjieba_rb is designed for chinese, we need something else for Japanese # Korean appears to be safe cause words are already space seperated # For Japanese we should investigate using kakasi - if ['zh_TW', 'zh_CN', 'ja'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese_japanese_korean + if segment_cjk? require 'cppjieba_rb' unless defined? CppjiebaRb mode = (purpose == :query ? :query : :mix) data = CppjiebaRb.segment(search_data, mode: mode) diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb index 470a732e7d..8182fe90bc 100644 --- a/lib/search/grouped_search_results.rb +++ b/lib/search/grouped_search_results.rb @@ -87,7 +87,7 @@ class Search blurb_length: @blurb_length } - if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION + if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION && !Search.segment_cjk? 
if SiteSetting.use_pg_headlines_for_excerpt scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1') prefix_omission = scrubbed_headline.start_with?(post.leading_raw_data) ? '' : OMISSION diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb index be6e051e89..c6a52dcdd2 100644 --- a/spec/components/search_spec.rb +++ b/spec/components/search_spec.rb @@ -1791,6 +1791,27 @@ describe Search do end end + context 'CJK segmentation' do + before do + SiteSetting.search_tokenize_chinese_japanese_korean = true + SiteSetting.min_search_term_length = 1 + end + + let!(:post1) do + Fabricate(:post, raw: '場サアマネ織企ういかせ竹域ヱイマ穂基ホ神3予読ずねいぱ松査ス禁多サウ提懸イふ引小43改こょドめ。深とつぐ主思料農ぞかル者杯検める活分えほづぼ白犠') + end + + it('does not include superflous spaces in blurbs') do + + results = Search.execute('ういかせ竹域', type_filter: 'topic') + expect(results.posts.length).to eq(1) + + expect(results.blurb(results.posts.first)).to include('ういかせ竹域') + + end + + end + context 'include_diacritics' do before { SiteSetting.search_ignore_accents = false } let!(:post1) { Fabricate(:post, raw: 'สวัสดี Régis hello') }