From 2f8f2aa1dd3cfe02a4a5b90f311ccd9b60ae400e Mon Sep 17 00:00:00 2001
From: Robin Ward <robin.ward@gmail.com>
Date: Fri, 21 Jul 2017 15:29:04 -0400
Subject: [PATCH] FEATURE: Whitelists for inline oneboxing

---
 app/jobs/regular/crawl_topic_link.rb    | 56 ++------------------
 config/locales/server.en.yml            |  1 +
 config/site_settings.yml                |  3 ++
 lib/inline_oneboxer.rb                  | 54 +++++++++++++------
 lib/retrieve_title.rb                   | 70 +++++++++++++++++++++++++
 spec/components/inline_oneboxer_spec.rb | 37 +++++++++----
 spec/components/retrieve_title_spec.rb  | 40 ++++++++++++++
 7 files changed, 185 insertions(+), 76 deletions(-)
 create mode 100644 lib/retrieve_title.rb
 create mode 100644 spec/components/retrieve_title_spec.rb

diff --git a/app/jobs/regular/crawl_topic_link.rb b/app/jobs/regular/crawl_topic_link.rb
index c13a6e7b03..b5077aff85 100644
--- a/app/jobs/regular/crawl_topic_link.rb
+++ b/app/jobs/regular/crawl_topic_link.rb
@@ -1,50 +1,11 @@
 require 'open-uri'
 require 'nokogiri'
 require 'excon'
-require 'final_destination'
+require_dependency 'retrieve_title'
 
 module Jobs
   class CrawlTopicLink < Jobs::Base
 
-    class ReadEnough < StandardError; end
-
-    def self.max_chunk_size(uri)
-      # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
-      # one host but amazon is a big one.
-      return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
-
-      # Default is 10k
-      10
-    end
-
-    # Fetch the beginning of a HTML document at a url
-    def self.fetch_beginning(url)
-      # Never crawl in test mode
-      return if Rails.env.test?
-
-      fd = FinalDestination.new(url)
-      uri = fd.resolve
-      return "" unless uri
-
-      result = ""
-      streamer = lambda do |chunk, _, _|
-        result << chunk
-
-        # Using exceptions for flow control is really bad, but there really seems to
-        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
-        # that matter!)
-        raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024)
-      end
-      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
-      result
-
-    rescue Excon::Errors::SocketError => ex
-      return result if ex.socket_error.is_a?(ReadEnough)
-      raise
-    rescue ReadEnough
-      result
-    end
-
     def execute(args)
       raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present?
 
@@ -72,18 +33,9 @@ module Jobs
 
         unless crawled
           # Fetch the beginning of the document to find the title
-          result = CrawlTopicLink.fetch_beginning(topic_link.url)
-          doc = Nokogiri::HTML(result)
-          if doc
-            title = doc.at('title').try(:inner_text)
-            if title.present?
-              title.gsub!(/\n/, ' ')
-              title.gsub!(/ +/, ' ')
-              title.strip!
-              if title.present?
-                crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
-              end
-            end
+          title = RetrieveTitle.crawl(topic_link.url)
+          if title.present?
+            crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
           end
         end
       rescue Exception
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index 65fd41b488..d46abb6573 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -979,6 +979,7 @@ en:
     show_pinned_excerpt_desktop: "Show excerpt on pinned topics in desktop view."
     post_onebox_maxlength: "Maximum length of a oneboxed Discourse post in characters."
     onebox_domains_blacklist: "A list of domains that will never be oneboxed."
+    inline_onebox_domains_whitelist: "A list of domains that will be oneboxed in miniature form if linked without a title"
     max_oneboxes_per_post: "Maximum number of oneboxes in a post."
 
     logo_url: "The logo image at the top left of your site, should be a wide rectangle shape. If left blank site title text will be shown."
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 7bef397bdd..c270144629 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -920,6 +920,9 @@ onebox:
   max_oneboxes_per_post:
     default: 50
     client: true
+  inline_onebox_domains_whitelist:
+    default: ''
+    type: list
 
 spam:
   add_rel_nofollow_to_user_content: true
diff --git a/lib/inline_oneboxer.rb b/lib/inline_oneboxer.rb
index 0f3ec89be9..b064794882 100644
--- a/lib/inline_oneboxer.rb
+++ b/lib/inline_oneboxer.rb
@@ -1,38 +1,50 @@
+require_dependency 'retrieve_title'
+
 class InlineOneboxer
 
-  def initialize(urls)
+  def initialize(urls, opts=nil)
     @urls = urls
+    @opts = opts || {}
   end
 
   def process
-    @urls.map {|url| InlineOneboxer.lookup(url) }.compact
+    @urls.map {|url| InlineOneboxer.lookup(url, @opts) }.compact
   end
 
-  def self.clear_cache!
+  def self.purge(url)
+    Rails.cache.delete(cache_key(url))
   end
 
   def self.cache_lookup(url)
     Rails.cache.read(cache_key(url))
   end
 
-  def self.lookup(url)
-    cached = cache_lookup(url)
-    return cached if cached.present?
+  def self.lookup(url, opts=nil)
+    opts ||= {}
+
+    unless opts[:skip_cache]
+      cached = cache_lookup(url)
+      return cached if cached.present?
+    end
 
     if route = Discourse.route_for(url)
       if route[:controller] == "topics" &&
         route[:action] == "show" &&
         topic = (Topic.where(id: route[:topic_id].to_i).first rescue nil)
 
-        # Only public topics
-        if Guardian.new.can_see?(topic)
-          onebox = {
-            url: url,
-            title: Emoji.gsub_emoji_to_unicode(topic.title)
-          }
-          Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
-          return onebox
-        end
+        return onebox_for(url, topic.title, opts) if Guardian.new.can_see?(topic)
+      end
+    end
+
+    if whitelist = SiteSetting.inline_onebox_domains_whitelist
+      uri = URI(url) rescue nil
+
+      domains = whitelist.split('|')
+      if uri.present? &&
+        uri.hostname.present? &&
+        domains.include?(uri.hostname) &&
+        title = RetrieveTitle.crawl(url)
+          return onebox_for(url, title, opts)
       end
     end
 
@@ -41,6 +53,18 @@ class InlineOneboxer
 
   private
 
+    def self.onebox_for(url, title, opts)
+      onebox = {
+        url: url,
+        title: Emoji.gsub_emoji_to_unicode(title)
+      }
+      unless opts[:skip_cache]
+        Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
+      end
+
+      onebox
+    end
+
     def self.cache_key(url)
       "inline_onebox:#{url}"
     end
diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb
new file mode 100644
index 0000000000..62d5498c15
--- /dev/null
+++ b/lib/retrieve_title.rb
@@ -0,0 +1,70 @@
+require_dependency 'final_destination'
+
+module RetrieveTitle
+  class ReadEnough < StandardError; end
+
+  def self.crawl(url)
+    extract_title(fetch_beginning(url))
+  rescue Exception
+    # If there was a connection error, do nothing
+  end
+
+  def self.extract_title(html)
+    title = nil
+    if doc = Nokogiri::HTML(html)
+
+      if node = doc.at('meta[property="og:title"]')
+        title = node['content']
+      end
+
+      title ||= doc.at('title')&.inner_text
+    end
+
+    if title.present?
+      title.gsub!(/\n/, ' ')
+      title.gsub!(/ +/, ' ')
+      title.strip!
+      return title
+    end
+    nil
+  end
+
+  private
+
+    def self.max_chunk_size(uri)
+      # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
+      # one host but amazon is a big one.
+      return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
+
+      # default is 10k
+      10
+    end
+
+    # Fetch the beginning of a HTML document at a url
+    def self.fetch_beginning(url)
+      # Never crawl in test mode
+      return if Rails.env.test?
+
+      fd = FinalDestination.new(url)
+      uri = fd.resolve
+      return "" unless uri
+
+      result = ""
+      streamer = lambda do |chunk, _, _|
+        result << chunk
+
+        # Using exceptions for flow control is really bad, but there really seems to
+        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
+        # that matter!)
+        raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
+      end
+      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
+      result
+
+    rescue Excon::Errors::SocketError => ex
+      return result if ex.socket_error.is_a?(ReadEnough)
+      raise
+    rescue ReadEnough
+      result
+    end
+end
diff --git a/spec/components/inline_oneboxer_spec.rb b/spec/components/inline_oneboxer_spec.rb
index b8577b0d9d..8f68f58c3e 100644
--- a/spec/components/inline_oneboxer_spec.rb
+++ b/spec/components/inline_oneboxer_spec.rb
@@ -3,17 +3,13 @@ require_dependency 'inline_oneboxer'
 
 describe InlineOneboxer do
 
-  before do
-    InlineOneboxer.clear_cache!
-  end
-
   it "should return nothing with empty input" do
     expect(InlineOneboxer.new([]).process).to be_blank
   end
 
   it "can onebox a topic" do
     topic = Fabricate(:topic)
-    results = InlineOneboxer.new([topic.url]).process
+    results = InlineOneboxer.new([topic.url], skip_cache: true).process
     expect(results).to be_present
     expect(results[0][:url]).to eq(topic.url)
     expect(results[0][:title]).to eq(topic.title)
@@ -21,13 +17,18 @@ describe InlineOneboxer do
 
   it "doesn't onebox private messages" do
     topic = Fabricate(:private_message_topic)
-    results = InlineOneboxer.new([topic.url]).process
+    results = InlineOneboxer.new([topic.url], skip_cache: true).process
     expect(results).to be_blank
   end
 
   context "caching" do
+    let(:topic) { Fabricate(:topic) }
+
+    before do
+      InlineOneboxer.purge(topic.url)
+    end
+
     it "puts an entry in the cache" do
-      topic = Fabricate(:topic)
       expect(InlineOneboxer.cache_lookup(topic.url)).to be_blank
 
       result = InlineOneboxer.lookup(topic.url)
@@ -43,7 +44,7 @@ describe InlineOneboxer do
   context ".lookup" do
     it "can lookup one link at a time" do
       topic = Fabricate(:topic)
-      onebox = InlineOneboxer.lookup(topic.url)
+      onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
       expect(onebox).to be_present
       expect(onebox[:url]).to eq(topic.url)
       expect(onebox[:title]).to eq(topic.title)
@@ -56,12 +57,30 @@ describe InlineOneboxer do
 
     it "will return the fancy title" do
       topic = Fabricate(:topic, title: "Hello :pizza: with an emoji")
-      onebox = InlineOneboxer.lookup(topic.url)
+      onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
       expect(onebox).to be_present
       expect(onebox[:url]).to eq(topic.url)
       expect(onebox[:title]).to eq("Hello 🍕 with an emoji")
     end
 
+    it "will not crawl domains that aren't whitelisted" do
+      onebox = InlineOneboxer.lookup("https://eviltrout.com", skip_cache: true)
+      expect(onebox).to be_blank
+    end
+
+    it "will lookup whitelisted domains" do
+      SiteSetting.inline_onebox_domains_whitelist = "eviltrout.com"
+      RetrieveTitle.stubs(:crawl).returns("Evil Trout's Blog")
+
+      onebox = InlineOneboxer.lookup(
+        "https://eviltrout.com/some-path",
+        skip_cache: true
+      )
+      expect(onebox).to be_present
+      expect(onebox[:url]).to eq("https://eviltrout.com/some-path")
+      expect(onebox[:title]).to eq("Evil Trout's Blog")
+    end
+
   end
 
 
diff --git a/spec/components/retrieve_title_spec.rb b/spec/components/retrieve_title_spec.rb
new file mode 100644
index 0000000000..8ec078855e
--- /dev/null
+++ b/spec/components/retrieve_title_spec.rb
@@ -0,0 +1,40 @@
+require 'rails_helper'
+require_dependency 'retrieve_title'
+
+describe RetrieveTitle do
+
+  context "extract_title" do
+
+    it "will extract the value from the title tag" do
+      title = RetrieveTitle.extract_title(
+        "<html><title>My Cool Title</title></html>"
+      )
+
+      expect(title).to eq("My Cool Title")
+    end
+
+    it "will strip whitespace" do
+      title = RetrieveTitle.extract_title(
+        "<html><title>   Another Title\n\n </title></html>"
+      )
+
+      expect(title).to eq("Another Title")
+    end
+
+    it "will prefer the title from an opengraph tag" do
+      title = RetrieveTitle.extract_title(<<~HTML
+        <html>
+          <title>Bad Title</title>
+          <meta property="og:title" content="Good Title" />
+        </html>
+        HTML
+      )
+
+      expect(title).to eq("Good Title")
+    end
+
+  end
+
+end
+
+