diff --git a/Gemfile b/Gemfile index d8acd785c5..8e8ee3924a 100644 --- a/Gemfile +++ b/Gemfile @@ -180,7 +180,7 @@ gem 'rqrcode' gem 'sshkey', require: false -gem 'charlock_holmes', require: false +gem 'rchardet', require: false if ENV["IMPORT"] == "1" gem 'mysql2' diff --git a/Gemfile.lock b/Gemfile.lock index 902d22f085..ab3362beca 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -75,7 +75,6 @@ GEM uniform_notifier (~> 1.11.0) byebug (10.0.2) certified (1.0.0) - charlock_holmes (0.7.6) chunky_png (1.3.10) claide (1.0.2) claide-plugins (0.9.2) @@ -321,6 +320,7 @@ GEM ffi (>= 1.0.6) msgpack (>= 0.4.3) trollop (>= 1.16.2) + rchardet (1.8.0) redis (4.0.1) redis-namespace (1.6.0) redis (>= 3.0.4) @@ -457,7 +457,6 @@ DEPENDENCIES bullet byebug certified - charlock_holmes cppjieba_rb danger discourse_image_optim @@ -523,6 +522,7 @@ DEPENDENCIES rb-fsevent rb-inotify (~> 0.9) rbtrace + rchardet redis redis-namespace rinku diff --git a/app/jobs/scheduled/poll_feed.rb b/app/jobs/scheduled/poll_feed.rb index b044b9dfa4..098615bb67 100644 --- a/app/jobs/scheduled/poll_feed.rb +++ b/app/jobs/scheduled/poll_feed.rb @@ -90,7 +90,7 @@ module Jobs def parsed_feed raw_feed, encoding = fetch_rss encoded_feed = Encodings.try_utf8(raw_feed, encoding) if encoding - encoded_feed = Encodings.to_utf8(raw_feed, encoding_hint: encoding) unless encoded_feed + encoded_feed = Encodings.to_utf8(raw_feed) unless encoded_feed return nil if encoded_feed.blank? diff --git a/lib/encodings.rb b/lib/encodings.rb index b9a8a876ea..f323deff07 100644 --- a/lib/encodings.rb +++ b/lib/encodings.rb @@ -1,20 +1,12 @@ -require 'charlock_holmes' +require 'rchardet' module Encodings - BINARY_SCAN_LENGTH = 0 + def self.to_utf8(string) + result = CharDet.detect(string) - def self.to_utf8(string, encoding_hint: nil, delete_bom: true) - detector = CharlockHolmes::EncodingDetector.new(BINARY_SCAN_LENGTH) - result = detector.detect(string, encoding_hint&.to_s) - - if result && result[:encoding] - string = CharlockHolmes::Converter.convert(string, result[:encoding], Encoding::UTF_8.name) - else - string = string.encode(Encoding::UTF_8, undef: :replace, invalid: :replace, replace: '') - end - - delete_bom!(string) if delete_bom - string + encoded_string = try_utf8(string, result['encoding']) if result && result['encoding'] + encoded_string = force_utf8(string) if encoded_string.nil? + encoded_string end def self.try_utf8(string, source_encoding) @@ -26,6 +18,14 @@ module Encodings nil end + def self.force_utf8(string) + encoded_string = string.encode(Encoding::UTF_8, + undef: :replace, + invalid: :replace, + replace: '') + delete_bom!(encoded_string) + end + def self.delete_bom!(string) string.sub!(/\A\xEF\xBB\xBF/, '') unless string.blank? string diff --git a/spec/lib/encodings_spec.rb b/spec/lib/encodings_spec.rb index 2d15ab5a00..3e63fd3855 100644 --- a/spec/lib/encodings_spec.rb +++ b/spec/lib/encodings_spec.rb @@ -1,9 +1,9 @@ require 'rails_helper' describe Encodings do - def to_utf8(filename, encoding_hint = nil) + def to_utf8(filename) string = File.read("#{Rails.root}/spec/fixtures/encodings/#{filename}").chomp - Encodings.to_utf8(string, encoding_hint: encoding_hint) + Encodings.to_utf8(string) end context "unicode" do