From e440ec2519d7dbd43d84f1d03334d1786e48a1f3 Mon Sep 17 00:00:00 2001
From: Sam Saffron
Date: Mon, 9 Dec 2019 17:43:51 +1100
Subject: [PATCH] FIX: crawler requests not tracked for non UTF-8 user agents

Non UTF-8 user_agent requests were bypassing logging due to PG always wanting UTF-8 strings.
This adds some conversion to ensure we are always dealing with UTF-8
---
 lib/middleware/request_tracker.rb | 7 ++++++-
 .../middleware/request_tracker_spec.rb | 20 +++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/lib/middleware/request_tracker.rb b/lib/middleware/request_tracker.rb
index 897a09a8fd..5f3a1f7458 100644
--- a/lib/middleware/request_tracker.rb
+++ b/lib/middleware/request_tracker.rb
@@ -117,7 +117,12 @@ class Middleware::RequestTracker
     }

     if h[:is_crawler]
-      h[:user_agent] = env['HTTP_USER_AGENT']
+      user_agent = env['HTTP_USER_AGENT']
+      if user_agent.encoding != Encoding::UTF_8
+        user_agent = user_agent.encode("utf-8")
+        user_agent.scrub!
+      end
+      h[:user_agent] = user_agent
     end

     if cache = headers["X-Discourse-Cached"]
diff --git a/spec/components/middleware/request_tracker_spec.rb b/spec/components/middleware/request_tracker_spec.rb
index 8faeaf079e..d6c413cb9d 100644
--- a/spec/components/middleware/request_tracker_spec.rb
+++ b/spec/components/middleware/request_tracker_spec.rb
@@ -15,6 +15,26 @@ describe Middleware::RequestTracker do
     }.merge(opts)
   end

+  context "full request" do
+    before do
+      @orig = WebCrawlerRequest.autoflush
+      WebCrawlerRequest.autoflush = 1
+    end
+    after do
+      WebCrawlerRequest.autoflush = @orig
+    end
+
+    it "can handle rogue user agents" do
+      agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")
+
+      middleware = Middleware::RequestTracker.new(->(env) { ["200", { "Content-Type" => "text/html" }, [""]] })
+      middleware.call(env("HTTP_USER_AGENT" => agent))
+
+      expect(WebCrawlerRequest.where(user_agent: agent.encode('utf-8')).count).to eq(1)
+    end
+
+  end
+
   context "log_request" do
     before do
       freeze_time Time.now