From 201d344a2d86878b5b0e446e3bfce487b0eb5cea Mon Sep 17 00:00:00 2001
From: Arpit Jalan
Date: Sun, 10 Jul 2016 14:49:24 +0530
Subject: [PATCH] improve vBulletin import script

---
 script/import_scripts/vbulletin.rb | 303 +++++++++++++++++++++++------
 1 file changed, 248 insertions(+), 55 deletions(-)

diff --git a/script/import_scripts/vbulletin.rb b/script/import_scripts/vbulletin.rb
index 588e1ddff1..5a7303d12c 100644
--- a/script/import_scripts/vbulletin.rb
+++ b/script/import_scripts/vbulletin.rb
@@ -1,13 +1,15 @@
 require 'mysql2'
 require File.expand_path(File.dirname(__FILE__) + "/base.rb")
 require 'htmlentities'
+require 'php_serialize' # https://github.com/jqr/php-serialize
 
 class ImportScripts::VBulletin < ImportScripts::Base
   BATCH_SIZE = 1000
 
   # CHANGE THESE BEFORE RUNNING THE IMPORTER
-  DATABASE = "iref"
-  TIMEZONE = "Asia/Kolkata"
+  DATABASE = "q23"
+  TABLE_PREFIX = "vb_"
+  TIMEZONE = "America/Los_Angeles"
   ATTACHMENT_DIR = '/path/to/your/attachment/folder'
 
   def initialize
@@ -32,10 +34,14 @@ class ImportScripts::VBulletin < ImportScripts::Base
     import_categories
     import_topics
     import_posts
+    import_private_messages
     import_attachments
 
     close_topics
     post_process_posts
+
+    create_permalinks
+    suspend_users
   end
 
   def import_groups
@@ -43,7 +49,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
 
     groups = mysql_query <<-SQL
         SELECT usergroupid, title
-          FROM usergroup
+          FROM #{TABLE_PREFIX}usergroup
       ORDER BY usergroupid
     SQL
 
@@ -58,12 +64,12 @@ class ImportScripts::VBulletin < ImportScripts::Base
   def import_users
     puts "", "importing users"
 
-    user_count = mysql_query("SELECT COUNT(userid) count FROM user").first["count"]
+    user_count = mysql_query("SELECT COUNT(userid) count FROM #{TABLE_PREFIX}user").first["count"]
 
     batches(BATCH_SIZE) do |offset|
       users = mysql_query <<-SQL
-          SELECT userid, username, homepage, usertitle, usergroupid, joindate, email
-            FROM user
+          SELECT userid, username, homepage, usertitle, usergroupid, joindate, email, lastvisit
+            FROM #{TABLE_PREFIX}user
         ORDER BY userid
            LIMIT #{BATCH_SIZE}
           OFFSET #{offset}
@@ -85,6 +91,7 @@ class
ImportScripts::VBulletin < ImportScripts::Base title: @htmlentities.decode(user["usertitle"]).strip, primary_group_id: group_id_from_imported_group_id(user["usergroupid"]), created_at: parse_timestamp(user["joindate"]), + last_seen_at: parse_timestamp(user["lastvisit"]), post_create_action: proc do |u| @old_username_to_new_usernames[user["username"]] = u.username import_profile_picture(user, u) @@ -98,7 +105,7 @@ class ImportScripts::VBulletin < ImportScripts::Base def import_profile_picture(old_user, imported_user) query = mysql_query <<-SQL SELECT filedata, filename - FROM customavatar + FROM #{TABLE_PREFIX}customavatar WHERE userid = #{old_user["userid"]} ORDER BY dateline DESC LIMIT 1 @@ -127,7 +134,7 @@ class ImportScripts::VBulletin < ImportScripts::Base def import_profile_background(old_user, imported_user) query = mysql_query <<-SQL SELECT filedata, filename - FROM customprofilepic + FROM #{TABLE_PREFIX}customprofilepic WHERE userid = #{old_user["userid"]} ORDER BY dateline DESC LIMIT 1 @@ -154,11 +161,11 @@ class ImportScripts::VBulletin < ImportScripts::Base def import_categories puts "", "importing top level categories..." - categories = mysql_query("SELECT forumid, title, description, displayorder, parentid FROM forum ORDER BY forumid").to_a + categories = mysql_query("SELECT forumid, title, description, displayorder, parentid FROM #{TABLE_PREFIX}forum ORDER BY forumid").to_a - top_level_categories = categories.select { |c| c["parentid"] == -1 } + # top_level_categories = categories.select { |c| c["parentid"] == -1 } - create_categories(top_level_categories) do |category| + create_categories(categories) do |category| { id: category["forumid"], name: @htmlentities.decode(category["title"]).strip, @@ -167,27 +174,27 @@ class ImportScripts::VBulletin < ImportScripts::Base } end - puts "", "importing children categories..." 
- - children_categories = categories.select { |c| c["parentid"] != -1 } - top_level_category_ids = Set.new(top_level_categories.map { |c| c["forumid"] }) - - # cut down the tree to only 2 levels of categories - children_categories.each do |cc| - while !top_level_category_ids.include?(cc["parentid"]) - cc["parentid"] = categories.detect { |c| c["forumid"] == cc["parentid"] }["parentid"] - end - end - - create_categories(children_categories) do |category| - { - id: category["forumid"], - name: @htmlentities.decode(category["title"]).strip, - position: category["displayorder"], - description: @htmlentities.decode(category["description"]).strip, - parent_category_id: category_id_from_imported_category_id(category["parentid"]) - } - end + # puts "", "importing children categories..." + # + # children_categories = categories.select { |c| c["parentid"] != -1 } + # top_level_category_ids = Set.new(top_level_categories.map { |c| c["forumid"] }) + # + # # cut down the tree to only 2 levels of categories + # children_categories.each do |cc| + # while !top_level_category_ids.include?(cc["parentid"]) + # cc["parentid"] = categories.detect { |c| c["forumid"] == cc["parentid"] }["parentid"] + # end + # end + # + # create_categories(children_categories) do |category| + # { + # id: category["forumid"], + # name: @htmlentities.decode(category["title"]).strip, + # position: category["displayorder"], + # description: @htmlentities.decode(category["description"]).strip, + # parent_category_id: category_id_from_imported_category_id(category["parentid"]) + # } + # end end def import_topics @@ -196,14 +203,14 @@ class ImportScripts::VBulletin < ImportScripts::Base # keep track of closed topics @closed_topic_ids = [] - topic_count = mysql_query("SELECT COUNT(threadid) count FROM thread").first["count"] + topic_count = mysql_query("SELECT COUNT(threadid) count FROM #{TABLE_PREFIX}thread").first["count"] batches(BATCH_SIZE) do |offset| topics = mysql_query <<-SQL SELECT t.threadid threadid, 
t.title title, forumid, open, postuserid, t.dateline dateline, views, t.visible visible, sticky,
                p.pagetext raw
-          FROM thread t
-          JOIN post p ON p.postid = t.firstpostid
+          FROM #{TABLE_PREFIX}thread t
+          JOIN #{TABLE_PREFIX}post p ON p.postid = t.firstpostid
       ORDER BY t.threadid
          LIMIT #{BATCH_SIZE}
         OFFSET #{offset}
@@ -237,15 +244,15 @@ class ImportScripts::VBulletin < ImportScripts::Base
     puts "", "importing posts..."
 
     # make sure `firstpostid` is indexed
-    mysql_query("CREATE INDEX firstpostid_index ON thread (firstpostid)")
+    mysql_query("CREATE INDEX firstpostid_index ON #{TABLE_PREFIX}thread (firstpostid)")
 
-    post_count = mysql_query("SELECT COUNT(postid) count FROM post WHERE postid NOT IN (SELECT firstpostid FROM thread)").first["count"]
+    post_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"]
 
     batches(BATCH_SIZE) do |offset|
       posts = mysql_query <<-SQL
           SELECT postid, userid, threadid, pagetext raw, dateline, visible, parentid
-            FROM post
-           WHERE postid NOT IN (SELECT firstpostid FROM thread)
+            FROM #{TABLE_PREFIX}post
+           WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)
         ORDER BY postid
            LIMIT #{BATCH_SIZE}
           OFFSET #{offset}
@@ -278,7 +285,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
   def find_upload(post, attachment_id)
     sql = "SELECT a.attachmentid attachment_id, a.userid user_id, a.filedataid file_id, a.filename filename,
                   a.caption caption
-             FROM attachment a
+             FROM #{TABLE_PREFIX}attachment a
             WHERE a.attachmentid = #{attachment_id}"
     results = mysql_query(sql)
 
@@ -310,11 +317,119 @@ class ImportScripts::VBulletin < ImportScripts::Base
 
     return nil
   end
+
+  # Imports the rows of vBulletin's pmtext table as posts with import id "pm-<pmtextid>"; a "Re:" message joins the topic of the first message recorded under the same title and participants.
+  def import_private_messages
+    puts "", "importing private messages..."
+
+    topic_count = mysql_query("SELECT COUNT(pmtextid) count FROM #{TABLE_PREFIX}pmtext").first["count"]
+
+    batches(BATCH_SIZE) do |offset|
+      private_messages = mysql_query <<-SQL
+          SELECT pmtextid, fromuserid, title, message, touserarray, dateline
+            FROM #{TABLE_PREFIX}pmtext
+        ORDER BY pmtextid
+           LIMIT #{BATCH_SIZE}
+          OFFSET #{offset}
+      SQL
+
+      break if private_messages.size < 1
+      next if all_records_exist? :posts, private_messages.map {|pm| "pm-#{pm['pmtextid']}" }
+
+      title_username_of_pm_first_post = {}
+
+      create_posts(private_messages, total: topic_count, offset: offset) do |m|
+        skip = false
+        mapped = {}
+
+        mapped[:id] = "pm-#{m['pmtextid']}"
+        mapped[:user_id] = user_id_from_imported_user_id(m['fromuserid']) || Discourse::SYSTEM_USER_ID
+        mapped[:raw] = preprocess_post_raw(m['message']) rescue nil
+        mapped[:created_at] = Time.zone.at(m['dateline'])
+        title = @htmlentities.decode(m['title']).strip[0...255]
+        topic_id = nil
+
+        next if mapped[:raw].blank?
+
+        # users who are part of this private message.
+        target_usernames = []
+        target_userids = []
+        begin
+          to_user_array = PHP.unserialize(m['touserarray'])
+        rescue
+          puts "#{m['pmtextid']} -- #{m['touserarray']}"
+          skip = true
+        end
+
+        begin
+          to_user_array.each do |to_user|
+            if to_user[0] == "cc" || to_user[0] == "bcc" # not sure if we should include bcc users
+              to_user[1].each do |to_user_cc|
+                user_id = user_id_from_imported_user_id(to_user_cc[0])
+                username = User.find_by(id: user_id).try(:username)
+                target_userids << (user_id || Discourse::SYSTEM_USER_ID) # parens needed: `<<` binds tighter than `||`
+                target_usernames << username if username
+              end
+            else
+              user_id = user_id_from_imported_user_id(to_user[0])
+              username = User.find_by(id: user_id).try(:username)
+              target_userids << (user_id || Discourse::SYSTEM_USER_ID) # parens needed: `<<` binds tighter than `||`
+              target_usernames << username if username
+            end
+          end
+        rescue
+          puts "skipping pm-#{m['pmtextid']} `to_user_array` is not properly serialized -- #{to_user_array.inspect}"
+          skip = true
+        end
+
+        participants = target_userids
+        participants << mapped[:user_id]
+        begin
+          participants.sort!
+        rescue
+          puts "one of the participant's id is nil -- #{participants.inspect}"
+        end
+
+        if title =~ /^Re:/
+          parent_id = title_username_of_pm_first_post[[title[3..-1], participants]]
+          parent_id = title_username_of_pm_first_post[[title[4..-1], participants]] unless parent_id
+          parent_id = title_username_of_pm_first_post[[title[5..-1], participants]] unless parent_id
+          parent_id = title_username_of_pm_first_post[[title[6..-1], participants]] unless parent_id
+          parent_id = title_username_of_pm_first_post[[title[7..-1], participants]] unless parent_id
+          parent_id = title_username_of_pm_first_post[[title[8..-1], participants]] unless parent_id
+          if parent_id # no first message recorded for this title => topic_id stays nil and a new PM topic is created below
+            if t = topic_lookup_from_imported_post_id("pm-#{parent_id}")
+              topic_id = t[:topic_id]
+            end
+          end
+        else
+          title_username_of_pm_first_post[[title, participants]] ||= m['pmtextid']
+        end
+
+        unless topic_id
+          mapped[:title] = title
+          mapped[:archetype] = Archetype.private_message
+          mapped[:target_usernames] = target_usernames.join(',')
+
+          if mapped[:target_usernames].empty? # pm with yourself?
+            # skip = true
+            mapped[:target_usernames] = "system"
+            puts "pm-#{m['pmtextid']} has no target (#{m['touserarray']})"
+          end
+        else
+          mapped[:topic_id] = topic_id
+        end
+
+        skip ? nil : mapped
+      end
+    end
+  end
+
+
   def import_attachments
     puts '', 'importing attachments...'
 
current_count = 0
-    total_count = mysql_query("SELECT COUNT(postid) count FROM post WHERE postid NOT IN (SELECT firstpostid FROM thread)").first["count"]
+    total_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"]
 
     success_count = 0
     fail_count = 0
@@ -430,7 +545,8 @@ class ImportScripts::VBulletin < ImportScripts::Base
              .gsub("\u2603", ">")
 
     # [URL=...]...[/URL]
-    raw = raw.gsub(/\[url="?(.+?)"?\](.+)\[\/url\]/i) { "[#{$2}](#{$1})" }
+    raw.gsub!(/\[url="?([^"]+?)"?\](.*?)\[\/url\]/im) { "[#{$2.strip}](#{$1})" }
+    raw.gsub!(/\[url="?(.+?)"?\](.+)\[\/url\]/im) { "[#{$2.strip}](#{$1})" }
 
     # [URL]...[/URL]
     # [MP3]...[/MP3]
@@ -446,17 +562,11 @@ class ImportScripts::VBulletin < ImportScripts::Base
       "@#{old_username}"
     end
 
-    # [MENTION=][/MENTION]
-    # raw = raw.gsub(/\[mention="?(\d+)"?\](.+?)\[\/mention\]/i) do
-    #   user_id, old_username = $1, $2
-    #   if user = @users.select { |u| u[:userid] == user_id }.first
-    #     old_username = @old_username_to_new_usernames[user[:username]] || user[:username]
-    #   end
-    #   "@#{old_username}"
-    # end
-
     # [QUOTE]...[/QUOTE]
-    raw = raw.gsub(/\[quote\](.+?)\[\/quote\]/im) { "\n> #{$1}\n" }
+    raw.gsub!(/\[quote\](.+?)\[\/quote\]/im) { |quote|
+      quote.gsub!(/\[quote\](.+?)\[\/quote\]/im) { "\n#{$1}\n" }
+      quote.gsub!(/\n(.+?)/) { "\n> #{$1}" }
+    }
 
     # [QUOTE=]...[/QUOTE]
     raw = raw.gsub(/\[quote=([^;\]]+)\](.+?)\[\/quote\]/im) do
@@ -473,6 +583,27 @@ class ImportScripts::VBulletin < ImportScripts::Base
     # [VIDEO=youtube;]...[/VIDEO]
     raw = raw.gsub(/\[video=youtube;([^\]]+)\].*?\[\/video\]/i) { "\n//youtu.be/#{$1}\n" }
 
+    # More Additions ....
+
+    # [spoiler=Some hidden stuff]SPOILER HERE!![/spoiler]
+    raw.gsub!(/\[spoiler="?(.+?)"?\](.+?)\[\/spoiler\]/im) { "\n#{$1}\n[spoiler]#{$2}[/spoiler]\n" }
+
+    # [IMG][IMG]http://i63.tinypic.com/akga3r.jpg[/IMG][/IMG]
+    raw.gsub!(/\[IMG\]\[IMG\](.+?)\[\/IMG\]\[\/IMG\]/i) { "[IMG]#{$1}[/IMG]" }
+
+    # convert list tags to ul and list=1 tags to ol
+    # (basically, we're only missing list=a here...)
+    # (https://meta.discourse.org/t/phpbb-3-importer-old/17397)
+    raw.gsub!(/\[list\](.*?)\[\/list\]/im, '[ul]\1[/ul]')
+    raw.gsub!(/\[list=1\](.*?)\[\/list\]/im, '[ol]\1[/ol]')
+    raw.gsub!(/\[list\](.*?)\[\/list:u\]/im, '[ul]\1[/ul]')
+    raw.gsub!(/\[list=1\](.*?)\[\/list:o\]/im, '[ol]\1[/ol]')
+    # convert *-tags to li-tags so bbcode-to-md can do its magic on phpBB's lists:
+    raw.gsub!(/\[\*\]\n/, '')
+    raw.gsub!(/\[\*\](.*?)\[\/\*:m\]/, '[li]\1[/li]')
+    raw.gsub!(/\[\*\](.*?)\n/, '[li]\1[/li]')
+
+
     raw
   end
 
@@ -546,6 +677,68 @@ class ImportScripts::VBulletin < ImportScripts::Base
 
     raw
   end
+
+  # Writes vb_map.csv (in this script's directory) with one "XXX<import id> YYY<topic id>" row per listable topic.
+  def create_permalinks
+    puts '', 'Creating Permalinks...', ''
+
+    id_mapping = []
+
+    Topic.listable_topics.find_each do |topic|
+      pcf = topic.first_post.try(:custom_fields) # a topic without a first post is skipped instead of raising
+      if pcf && pcf["import_id"]
+        id = pcf["import_id"].split('-').last
+        id_mapping.push("XXX#{id} YYY#{topic.id}")
+      end
+    end
+
+    # Category.find_each do |cat|
+    #   ccf = cat.custom_fields
+    #   if ccf && ccf["import_id"]
+    #     id = ccf["import_id"].to_i
+    #     id_mapping.push("/forumdisplay.php?#{id} http://forum.quartertothree.com#{cat.url}")
+    #   end
+    # end
+
+    CSV.open(File.expand_path("../vb_map.csv", __FILE__), "w") do |csv|
+      id_mapping.each do |value|
+        csv << [value]
+      end
+    end
+  end
+
+
+  # Suspends, until 200 years from now, every imported user that has a row in vBulletin's userban table.
+  def suspend_users
+    puts '', "updating banned users"
+
+    banned = 0
+    failed = 0
+    total = mysql_query("SELECT count(*) count FROM #{TABLE_PREFIX}userban").first['count']
+    system_user = Discourse.system_user
+
+    mysql_query("SELECT userid, bandate FROM #{TABLE_PREFIX}userban").each do |b|
+      user = User.find_by_id(user_id_from_imported_user_id(b['userid'])) # userban.userid is a vBulletin id, so map it first
+      if user
+        user.suspended_at = parse_timestamp(b["bandate"]) # bandate is selected from the userban row, not from User
+        user.suspended_till = 200.years.from_now
+
+        if user.save
+          StaffActionLogger.new(system_user).log_user_suspend(user, "banned during initial import")
+          banned += 1
+        else
+          puts "Failed to suspend user #{user.username}. #{user.errors.try(:full_messages).try(:inspect)}"
+          failed += 1
+        end
+      else
+        puts "Not found: #{b['userid']}"
+        failed += 1
+      end
+
+      print_status banned + failed, total
+    end
+  end
+
   def parse_timestamp(timestamp)
     Time.zone.at(@tz.utc_to_local(timestamp))
   end