From 2f8f2aa1dd3cfe02a4a5b90f311ccd9b60ae400e Mon Sep 17 00:00:00 2001 From: Robin Ward Date: Fri, 21 Jul 2017 15:29:04 -0400 Subject: [PATCH] FEATURE: Whitelists for inline oneboxing --- app/jobs/regular/crawl_topic_link.rb | 56 ++------------------ config/locales/server.en.yml | 1 + config/site_settings.yml | 3 ++ lib/inline_oneboxer.rb | 54 +++++++++++++------ lib/retrieve_title.rb | 70 +++++++++++++++++++++++++ spec/components/inline_oneboxer_spec.rb | 37 +++++++++---- spec/components/retrieve_title_spec.rb | 40 ++++++++++++++ 7 files changed, 185 insertions(+), 76 deletions(-) create mode 100644 lib/retrieve_title.rb create mode 100644 spec/components/retrieve_title_spec.rb diff --git a/app/jobs/regular/crawl_topic_link.rb b/app/jobs/regular/crawl_topic_link.rb index c13a6e7b03..b5077aff85 100644 --- a/app/jobs/regular/crawl_topic_link.rb +++ b/app/jobs/regular/crawl_topic_link.rb @@ -1,50 +1,11 @@ require 'open-uri' require 'nokogiri' require 'excon' -require 'final_destination' +require_dependency 'retrieve_title' module Jobs class CrawlTopicLink < Jobs::Base - class ReadEnough < StandardError; end - - def self.max_chunk_size(uri) - # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for - # one host but amazon is a big one. - return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/ - - # Default is 10k - 10 - end - - # Fetch the beginning of a HTML document at a url - def self.fetch_beginning(url) - # Never crawl in test mode - return if Rails.env.test? - - fd = FinalDestination.new(url) - uri = fd.resolve - return "" unless uri - - result = "" - streamer = lambda do |chunk, _, _| - result << chunk - - # Using exceptions for flow control is really bad, but there really seems to - # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for - # that matter!) - raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024) - end - Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers) - result - - rescue Excon::Errors::SocketError => ex - return result if ex.socket_error.is_a?(ReadEnough) - raise - rescue ReadEnough - result - end - def execute(args) raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present? @@ -72,18 +33,9 @@ module Jobs unless crawled # Fetch the beginning of the document to find the title - result = CrawlTopicLink.fetch_beginning(topic_link.url) - doc = Nokogiri::HTML(result) - if doc - title = doc.at('title').try(:inner_text) - if title.present? - title.gsub!(/\n/, ' ') - title.gsub!(/ +/, ' ') - title.strip! - if title.present? - crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1) - end - end + title = RetrieveTitle.crawl(topic_link.url) + if title.present? + crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1) end end rescue Exception diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 65fd41b488..d46abb6573 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -979,6 +979,7 @@ en: show_pinned_excerpt_desktop: "Show excerpt on pinned topics in desktop view." post_onebox_maxlength: "Maximum length of a oneboxed Discourse post in characters." onebox_domains_blacklist: "A list of domains that will never be oneboxed." + inline_onebox_domains_whitelist: "A list of domains that will be oneboxed in miniature form if linked without a title" max_oneboxes_per_post: "Maximum number of oneboxes in a post." logo_url: "The logo image at the top left of your site, should be a wide rectangle shape. If left blank site title text will be shown." diff --git a/config/site_settings.yml b/config/site_settings.yml index 7bef397bdd..c270144629 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -920,6 +920,9 @@ onebox: max_oneboxes_per_post: default: 50 client: true + inline_onebox_domains_whitelist: + default: '' + type: list spam: add_rel_nofollow_to_user_content: true diff --git a/lib/inline_oneboxer.rb b/lib/inline_oneboxer.rb index 0f3ec89be9..b064794882 100644 --- a/lib/inline_oneboxer.rb +++ b/lib/inline_oneboxer.rb @@ -1,38 +1,50 @@ +require_dependency 'retrieve_title' + class InlineOneboxer - def initialize(urls) + def initialize(urls, opts=nil) @urls = urls + @opts = opts || {} end def process - @urls.map {|url| InlineOneboxer.lookup(url) }.compact + @urls.map {|url| InlineOneboxer.lookup(url, @opts) }.compact end - def self.clear_cache! + def self.purge(url) + Rails.cache.delete(cache_key(url)) end def self.cache_lookup(url) Rails.cache.read(cache_key(url)) end - def self.lookup(url) - cached = cache_lookup(url) - return cached if cached.present? + def self.lookup(url, opts=nil) + opts ||= {} + + unless opts[:skip_cache] + cached = cache_lookup(url) + return cached if cached.present? + end if route = Discourse.route_for(url) if route[:controller] == "topics" && route[:action] == "show" && topic = (Topic.where(id: route[:topic_id].to_i).first rescue nil) - # Only public topics - if Guardian.new.can_see?(topic) - onebox = { - url: url, - title: Emoji.gsub_emoji_to_unicode(topic.title) - } - Rails.cache.write(cache_key(url), onebox, expires_in: 1.day) - return onebox - end + return onebox_for(url, topic.title, opts) if Guardian.new.can_see?(topic) + end + end + + if whitelist = SiteSetting.inline_onebox_domains_whitelist + uri = URI(url) rescue nil + + domains = whitelist.split('|') + if uri.present? && + uri.hostname.present? && + domains.include?(uri.hostname) && + title = RetrieveTitle.crawl(url) + return onebox_for(url, title, opts) end end @@ -41,6 +53,18 @@ class InlineOneboxer private + def self.onebox_for(url, title, opts) + onebox = { + url: url, + title: Emoji.gsub_emoji_to_unicode(title) + } + unless opts[:skip_cache] + Rails.cache.write(cache_key(url), onebox, expires_in: 1.day) + end + + onebox + end + def self.cache_key(url) "inline_onebox:#{url}" end diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb new file mode 100644 index 0000000000..62d5498c15 --- /dev/null +++ b/lib/retrieve_title.rb @@ -0,0 +1,70 @@ +require_dependency 'final_destination' + +module RetrieveTitle + class ReadEnough < StandardError; end + + def self.crawl(url) + extract_title(fetch_beginning(url)) + rescue Exception + # If there was a connection error, do nothing + end + + def self.extract_title(html) + title = nil + if doc = Nokogiri::HTML(html) + + if node = doc.at('meta[property="og:title"]') + title = node['content'] + end + + title ||= doc.at('title')&.inner_text + end + + if title.present? + title.gsub!(/\n/, ' ') + title.gsub!(/ +/, ' ') + title.strip! + return title + end + nil + end + + private + + def self.max_chunk_size(uri) + # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for + # one host but amazon is a big one. + return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/ + + # default is 10k + 10 + end + + # Fetch the beginning of a HTML document at a url + def self.fetch_beginning(url) + # Never crawl in test mode + return if Rails.env.test? + + fd = FinalDestination.new(url) + uri = fd.resolve + return "" unless uri + + result = "" + streamer = lambda do |chunk, _, _| + result << chunk + + # Using exceptions for flow control is really bad, but there really seems to + # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for + # that matter!) + raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024) + end + Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers) + result + + rescue Excon::Errors::SocketError => ex + return result if ex.socket_error.is_a?(ReadEnough) + raise + rescue ReadEnough + result + end +end diff --git a/spec/components/inline_oneboxer_spec.rb b/spec/components/inline_oneboxer_spec.rb index b8577b0d9d..8f68f58c3e 100644 --- a/spec/components/inline_oneboxer_spec.rb +++ b/spec/components/inline_oneboxer_spec.rb @@ -3,17 +3,13 @@ require_dependency 'inline_oneboxer' describe InlineOneboxer do - before do - InlineOneboxer.clear_cache! - end - it "should return nothing with empty input" do expect(InlineOneboxer.new([]).process).to be_blank end it "can onebox a topic" do topic = Fabricate(:topic) - results = InlineOneboxer.new([topic.url]).process + results = InlineOneboxer.new([topic.url], skip_cache: true).process expect(results).to be_present expect(results[0][:url]).to eq(topic.url) expect(results[0][:title]).to eq(topic.title) @@ -21,13 +17,18 @@ describe InlineOneboxer do it "doesn't onebox private messages" do topic = Fabricate(:private_message_topic) - results = InlineOneboxer.new([topic.url]).process + results = InlineOneboxer.new([topic.url], skip_cache: true).process expect(results).to be_blank end context "caching" do + let(:topic) { Fabricate(:topic) } + + before do + InlineOneboxer.purge(topic.url) + end + it "puts an entry in the cache" do - topic = Fabricate(:topic) expect(InlineOneboxer.cache_lookup(topic.url)).to be_blank result = InlineOneboxer.lookup(topic.url) @@ -43,7 +44,7 @@ describe InlineOneboxer do context ".lookup" do it "can lookup one link at a time" do topic = Fabricate(:topic) - onebox = InlineOneboxer.lookup(topic.url) + onebox = InlineOneboxer.lookup(topic.url, skip_cache: true) expect(onebox).to be_present expect(onebox[:url]).to eq(topic.url) expect(onebox[:title]).to eq(topic.title) @@ -56,12 +57,30 @@ describe InlineOneboxer do it "will return the fancy title" do topic = Fabricate(:topic, title: "Hello :pizza: with an emoji") - onebox = InlineOneboxer.lookup(topic.url) + onebox = InlineOneboxer.lookup(topic.url, skip_cache: true) expect(onebox).to be_present expect(onebox[:url]).to eq(topic.url) expect(onebox[:title]).to eq("Hello 🍕 with an emoji") end + it "will not crawl domains that aren't whitelisted" do + onebox = InlineOneboxer.lookup("https://eviltrout.com", skip_cache: true) + expect(onebox).to be_blank + end + + it "will lookup whitelisted domains" do + SiteSetting.inline_onebox_domains_whitelist = "eviltrout.com" + RetrieveTitle.stubs(:crawl).returns("Evil Trout's Blog") + + onebox = InlineOneboxer.lookup( + "https://eviltrout.com/some-path", + skip_cache: true + ) + expect(onebox).to be_present + expect(onebox[:url]).to eq("https://eviltrout.com/some-path") + expect(onebox[:title]).to eq("Evil Trout's Blog") + end + end diff --git a/spec/components/retrieve_title_spec.rb b/spec/components/retrieve_title_spec.rb new file mode 100644 index 0000000000..8ec078855e --- /dev/null +++ b/spec/components/retrieve_title_spec.rb @@ -0,0 +1,40 @@ +require 'rails_helper' +require_dependency 'retrieve_title' + +describe RetrieveTitle do + + context "extract_title" do + + it "will extract the value from the title tag" do + title = RetrieveTitle.extract_title( + "My Cool Title" + ) + + expect(title).to eq("My Cool Title") + end + + it "will strip whitespace" do + title = RetrieveTitle.extract_title( + " Another Title\n\n " + ) + + expect(title).to eq("Another Title") + end + + it "will prefer the title from an opengraph tag" do + title = RetrieveTitle.extract_title(<<~HTML + + Bad Title + + + HTML + ) + + expect(title).to eq("Good Title") + end + + end + +end + +