diff --git a/script/bulk_import/base.rb b/script/bulk_import/base.rb
index 57a7eac029..a9188f6b17 100644
--- a/script/bulk_import/base.rb
+++ b/script/bulk_import/base.rb
@@ -58,6 +58,7 @@ class BulkImport::Base
db = ActiveRecord::Base.connection_config
@encoder = PG::TextEncoder::CopyRow.new
@raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port])
+ # @raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port], password: "discourse")
@uploader = ImportScripts::Uploader.new
@html_entities = HTMLEntities.new
@encoding = CHARSET_MAP[charset]
@@ -580,13 +581,18 @@ class BulkImport::Base
@raw_connection.copy_data(sql, @encoder) do
rows.each do |row|
- mapped = yield(row)
- next unless mapped
- processed = send(process_method_name, mapped)
- imported_ids << mapped[:imported_id] unless mapped[:imported_id].nil?
- imported_ids |= mapped[:imported_ids] unless mapped[:imported_ids].nil?
- @raw_connection.put_copy_data columns.map { |c| processed[c] }
- print "\r%7d - %6d/sec".freeze % [imported_ids.size, imported_ids.size.to_f / (Time.now - start)] if imported_ids.size % 5000 == 0
+ begin
+ mapped = yield(row)
+ next unless mapped
+ processed = send(process_method_name, mapped)
+ imported_ids << mapped[:imported_id] unless mapped[:imported_id].nil?
+ imported_ids |= mapped[:imported_ids] unless mapped[:imported_ids].nil?
+ @raw_connection.put_copy_data columns.map { |c| processed[c] }
+ print "\r%7d - %6d/sec".freeze % [imported_ids.size, imported_ids.size.to_f / (Time.now - start)] if imported_ids.size % 5000 == 0
+ rescue => e
+ puts "\n"
+ puts "ERROR: #{e.inspect}"
+ end
end
end
@@ -624,6 +630,10 @@ class BulkImport::Base
@uploader.create_upload(user_id, path, source_filename)
end
+ # Delegates to ImportScripts::Uploader#html_for_upload to build the
+ # Markdown/HTML snippet (upload link or image tag) for an already
+ # created Upload record, using +display_filename+ as the visible name.
+ def html_for_upload(upload, display_filename)
+ @uploader.html_for_upload(upload, display_filename)
+ end
+
def fix_name(name)
name.scrub! if name.valid_encoding? == false
return if name.blank?
diff --git a/script/bulk_import/vanilla.rb b/script/bulk_import/vanilla.rb
index 584b995619..ff24daffc5 100644
--- a/script/bulk_import/vanilla.rb
+++ b/script/bulk_import/vanilla.rb
@@ -18,7 +18,9 @@ class BulkImport::Vanilla < BulkImport::Base
@client = Mysql2::Client.new(
host: "localhost",
username: "root",
- database: VANILLA_DB
+ database: VANILLA_DB,
+ password: "",
+ reconnect: true
)
@import_tags = false
@@ -42,6 +44,7 @@ class BulkImport::Vanilla < BulkImport::Base
import_avatars # slow
create_permalinks # TODO: do it bulk style
+ import_attachments # slow
end
def execute
@@ -54,8 +57,14 @@ class BulkImport::Vanilla < BulkImport::Base
# other good ones:
# SiteSetting.port = 3000
+ # SiteSetting.permalink_normalizations = "/discussion\/(\d+)\/.*/discussion/\1"
# SiteSetting.automatic_backups_enabled = false
# SiteSetting.disable_emails = "non-staff"
+ # SiteSetting.authorized_extensions = '*'
+ # SiteSetting.max_image_size_kb = 102400
+ # SiteSetting.max_attachment_size_kb = 102400
+ # SiteSetting.clean_up_uploads = false
+ # SiteSetting.clean_orphan_uploads_grace_period_hours = 43200
# etc.
import_users
@@ -250,6 +259,86 @@ class BulkImport::Vanilla < BulkImport::Base
end
end
+ def import_attachments
+ if ATTACHMENTS_BASE_DIR && File.exists?(ATTACHMENTS_BASE_DIR)
+ puts "", "importing attachments"
+
+ start = Time.now
+ count = 0
+
+ # https://us.v-cdn.net/1234567/uploads/editor/xyz/image.jpg
+ cdn_regex = /https:\/\/us.v-cdn.net\/1234567\/uploads\/(\S+\/(\w|-)+.\w+)/i
+ # [attachment=10109:Screen Shot 2012-04-01 at 3.47.35 AM.png]
+ attachment_regex = /\[attachment=(\d+):(.*?)\]/i
+
+ Post.where("raw LIKE '%/us.v-cdn.net/%' OR raw LIKE '%[attachment%'").find_each do |post|
+ count += 1
+ print "\r%7d - %6d/sec".freeze % [count, count.to_f / (Time.now - start)]
+ new_raw = post.raw.dup
+
+ new_raw.gsub!(attachment_regex) do |s|
+ matches = attachment_regex.match(s)
+ attachment_id = matches[1]
+ file_name = matches[2]
+ next unless attachment_id
+
+ r = mysql_query("SELECT Path, Name FROM #{TABLE_PREFIX}Media WHERE MediaID = #{attachment_id};").first
+ next if r.nil?
+ path = r["Path"]
+ name = r["Name"]
+ next unless path.present?
+
+ path.gsub!("s3://content/", "")
+ path.gsub!("s3://uploads/", "")
+ file_path = "#{ATTACHMENTS_BASE_DIR}/#{path}"
+
+ if File.exists?(file_path)
+ upload = create_upload(post.user.id, file_path, File.basename(file_path))
+ if upload && upload.errors.empty?
+ # upload.url
+ filename = name || file_name || File.basename(file_path)
+ html_for_upload(upload, normalize_text(filename))
+ else
+ puts "Error: Upload did not persist for #{post.id} #{attachment_id}!"
+ end
+ else
+ puts "Couldn't find file for #{attachment_id}. Skipping."
+ next
+ end
+ end
+
+ new_raw.gsub!(cdn_regex) do |s|
+ matches = cdn_regex.match(s)
+ attachment_id = matches[1]
+
+ file_path = "#{ATTACHMENTS_BASE_DIR}/#{attachment_id}"
+
+ if File.exists?(file_path)
+ upload = create_upload(post.user.id, file_path, File.basename(file_path))
+ if upload && upload.errors.empty?
+ upload.url
+ else
+ puts "Error: Upload did not persist for #{post.id} #{attachment_id}!"
+ end
+ else
+ puts "Couldn't find file for #{attachment_id}. Skipping."
+ next
+ end
+ end
+
+ if new_raw != post.raw
+ begin
+ PostRevisor.new(post).revise!(post.user, { raw: new_raw }, skip_revision: true, skip_validations: true, bypass_bump: true)
+ rescue
+ puts "PostRevisor error for #{post.id}"
+ post.raw = new_raw
+ post.save(validate: false)
+ end
+ end
+ end
+ end
+ end
+
def find_photo_file(path, base_filename)
base_guess = base_filename.dup
full_guess = File.join(path, base_guess) # often an exact match exists
@@ -538,16 +627,18 @@ class BulkImport::Vanilla < BulkImport::Base
pcf = post.custom_fields
if pcf && pcf["import_id"]
topic = post.topic
- id = pcf["import_id"].split('-').last
- if post.post_number == 1
- slug = Slug.for(topic.title) # probably matches what vanilla would do...
- @raw_connection.put_copy_data(
- ["discussion/#{id}/#{slug}", topic.id, nil, now, now]
- )
- else
- @raw_connection.put_copy_data(
- ["discussion/comment/#{id}", nil, post.id, now, now]
- )
+ if topic.present?
+ id = pcf["import_id"].split('-').last
+ if post.post_number == 1
+ slug = Slug.for(topic.title) # probably matches what vanilla would do...
+ @raw_connection.put_copy_data(
+ ["discussion/#{id}/#{slug}", topic.id, nil, now, now]
+ )
+ else
+ @raw_connection.put_copy_data(
+ ["discussion/comment/#{id}", nil, post.id, now, now]
+ )
+ end
end
end
@@ -559,10 +650,46 @@ class BulkImport::Vanilla < BulkImport::Base
def clean_up(raw)
# post id is sometimes prefixed with "c-"
raw.gsub!(/\[QUOTE="([^;]+);c-(\d+)"\]/i) { "[QUOTE=#{$1};#{$2}]" }
+ raw = raw.delete("\u0000")
+ raw = process_raw_text(raw)
raw
end
+ def process_raw_text(raw)
+ return "" if raw.blank?
+ text = raw.dup
+ text = CGI.unescapeHTML(text)
+
+ text.gsub!(/:(?:\w{8})\]/, ']')
+
+ # Some links look like this: http://www.onegameamonth.com
+ text.gsub!(/(.+)<\/a>/i, '[\2](\1)')
+
+ # phpBB shortens link text like this, which breaks our markdown processing:
+ # [http://answers.yahoo.com/question/index ... 223AAkkPli](http://answers.yahoo.com/question/index?qid=20070920134223AAkkPli)
+ #
+ # Work around it for now:
+ text.gsub!(/\[http(s)?:\/\/(www\.)?/i, '[')
+
+ # convert list tags to ul and list=1 tags to ol
+ # list=a is not supported, so handle it like list=1
+ # list=9 and list=x have the same result as list=1 and list=a
+ text.gsub!(/\[list\](.*?)\[\/list:u\]/mi, '[ul]\1[/ul]')
+ text.gsub!(/\[list=.*?\](.*?)\[\/list:o\]/mi, '[ol]\1[/ol]')
+
+ # convert *-tags to li-tags so bbcode-to-md can do its magic on phpBB's lists:
+ text.gsub!(/\[\*\](.*?)\[\/\*:m\]/mi, '[li]\1[/li]')
+
+ # [QUOTE=""] -- add newline
+ text.gsub!(/(\[quote="[a-zA-Z\d]+"\])/i) { "#{$1}\n" }
+
+ # [/QUOTE] -- add newline
+ text.gsub!(/(\[\/quote\])/i) { "\n#{$1}" }
+
+ text
+ end
+
def staff_guardian
@_staff_guardian ||= Guardian.new(Discourse.system_user)
end