From 820fea835cd949fd67b9df808ff3dec964023218 Mon Sep 17 00:00:00 2001 From: Dan Ungureanu Date: Mon, 7 Feb 2022 23:03:01 +0200 Subject: [PATCH] FIX: Further reduce the input of to_tsvector (#15716) Random strings can result in much longer tsvectors. For example, parsing a Base64 string of ~600kb can result in a tsvector of over 1MB, which is the maximum size of a tsvector. Follow-up-to: 823c3f09d44ab89e88c4910abe36899bb23d601d --- app/services/search_indexer.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index e774ad4df6..36a846fdf2 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -120,11 +120,11 @@ class SearchIndexer a_weight: topic_title, b_weight: category_name, c_weight: topic_tags, - # Length of a tsvector must be less than 1_048_576 bytes. - # The difference between the max ouptut limit and imposed input limit - # accounts for the fact that sometimes the output tsvector may be - # slighlty longer than the input. - d_weight: scrub_html_for_search(cooked)[0..1_000_000] + # The tsvector resulted from parsing a string can be double the size of + # the original string. Since there is no way to estimate the length of + # the expected tsvector, we limit the input to ~50% of the maximum + # length of a tsvector (1_048_576 bytes). + d_weight: scrub_html_for_search(cooked)[0..600_000] ) do |params| params["private_message"] = private_message end