diff --git a/app/jobs/regular/sync_acls_for_uploads.rb b/app/jobs/regular/sync_acls_for_uploads.rb index d3533ff5ab..73837ff7e7 100644 --- a/app/jobs/regular/sync_acls_for_uploads.rb +++ b/app/jobs/regular/sync_acls_for_uploads.rb @@ -18,7 +18,18 @@ module Jobs Rails.logger.warn("Syncing ACL for upload ids: #{args[:upload_ids].join(", ")}") Upload.includes(:optimized_images).where(id: args[:upload_ids]).find_in_batches do |uploads| uploads.each do |upload| - Discourse.store.update_upload_ACL(upload, optimized_images_preloaded: true) + begin + Discourse.store.update_upload_ACL(upload, optimized_images_preloaded: true) + rescue => err + Discourse.warn_exception( + err, + message: "Failed to update upload ACL", + env: { + upload_id: upload.id, + filename: upload.original_filename + } + ) + end end end Rails.logger.warn("Completed syncing ACL for upload ids in #{time.to_s}. IDs: #{args[:upload_ids].join(", ")}") diff --git a/lib/tasks/uploads.rake b/lib/tasks/uploads.rake index 3d24f95695..051b4e9531 100644 --- a/lib/tasks/uploads.rake +++ b/lib/tasks/uploads.rake @@ -506,10 +506,14 @@ task "uploads:sync_s3_acls" => :environment do exit 1 end - puts "CAUTION: This task may take a long time to complete!" + puts "CAUTION: This task may take a long time to complete! There are #{Upload.count} uploads to sync ACLs for." + puts "" puts "-" * 30 puts "Uploads marked as secure will get a private ACL, and uploads marked as not secure will get a public ACL." - adjust_acls(Upload.find_each(batch_size: 100)) + puts "Upload ACLs will be updated in Sidekiq jobs in batches of 100 at a time, check Sidekiq queues for SyncAclsForUploads for progress." + Upload.select(:id).find_in_batches(batch_size: 100) do |uploads| + adjust_acls(uploads.map(&:id)) + end puts "", "Upload ACL sync complete!" end end @@ -525,43 +529,30 @@ task "uploads:disable_secure_media" => :environment do SiteSetting.secure_media = false - secure_uploads = Upload.includes(:posts).where(secure: true) + secure_uploads = Upload.joins(:post_uploads).where(secure: true) secure_upload_count = secure_uploads.count - uploads_to_adjust_acl_for = [] - posts_to_rebake = {} - - i = 0 - secure_uploads.find_each(batch_size: 20).each do |upload| - uploads_to_adjust_acl_for << upload - - upload.posts.each do |post| - # don't want unnecessary double-ups - next if posts_to_rebake.key?(post.id) - posts_to_rebake[post.id] = post - end - - i += 1 - end + secure_upload_ids = secure_uploads.pluck(:id) puts "", "Marking #{secure_upload_count} uploads as not secure.", "" - secure_uploads.update_all(secure: false) + secure_uploads.update_all( + secure: false, + security_last_changed_at: Time.zone.now, + security_last_changed_reason: "marked as not secure by disable_secure_media task" + ) - adjust_acls(uploads_to_adjust_acl_for) - post_rebake_errors = rebake_upload_posts(posts_to_rebake) + post_ids_to_rebake = DB.query_single( + "SELECT DISTINCT post_id FROM post_uploads WHERE upload_id IN (?)", secure_upload_ids + ) + adjust_acls(secure_upload_ids) + post_rebake_errors = rebake_upload_posts(post_ids_to_rebake) log_rebake_errors(post_rebake_errors) - RakeHelpers.print_status_with_label("Rebaking and updating complete! ", i, secure_upload_count) + puts "", "Rebaking and uploading complete!", "" end puts "", "Secure media is now disabled!", "" end -# Renamed to uploads:secure_upload_analyse_and_update -task "uploads:ensure_correct_acl" => :environment do - puts "This task has been deprecated, run uploads:secure_upload_analyse_and_update task instead." - exit 1 -end - ## # Run this task whenever the secure_media or login_required # settings are changed for a Discourse instance to update @@ -576,10 +567,8 @@ task "uploads:secure_upload_analyse_and_update" => :environment do end puts "Analyzing security for uploads in #{db}...", "" - upload_ids_to_mark_as_secure, upload_ids_to_mark_as_not_secure, posts_to_rebake, uploads_to_adjust_acl_for = nil + all_upload_ids_changed, post_ids_to_rebake = nil Upload.transaction do - mark_secure_in_loop_because_no_login_required = false - # If secure media is enabled we need to first set the access control post of # all post uploads (even uploads that are linked to multiple posts). If the # upload is not set to secure media then this has no other effect on the upload, @@ -589,73 +578,77 @@ task "uploads:secure_upload_analyse_and_update" => :environment do update_uploads_access_control_post end - # Get all uploads in the database, including optimized images. Both media (images, videos, - # etc) along with attachments (pdfs, txt, etc.) must be loaded because all can be marked as - # secure based on site settings. - uploads_to_update = Upload.includes(:posts, :optimized_images).joins(:post_uploads) - - puts "There are #{uploads_to_update.count} upload(s) that could be marked secure.", "" - - # Simply mark all these uploads as secure if login_required because no anons will be able to access them + puts "", "Analysing which uploads need to be marked secure and be rebaked.", "" if SiteSetting.login_required? - mark_secure_in_loop_because_no_login_required = false + # Simply mark all uploads linked to posts secure if login_required because no anons will be able to access them. + post_ids_to_rebake, all_upload_ids_changed = mark_all_as_secure_login_required else - - # If NOT login_required, then we have to go for the other slower flow, where in the loop - # we mark the upload secure based on UploadSecurity.should_be_secure? - mark_secure_in_loop_because_no_login_required = true - puts "Marking posts as secure in the next step because login_required is false." - end - - puts "", "Analysing which of #{uploads_to_update.count} uploads need to be marked secure and be rebaked.", "" - - upload_ids_to_mark_as_secure, - upload_ids_to_mark_as_not_secure, - posts_to_rebake, - uploads_to_adjust_acl_for = determine_upload_security_and_posts_to_rebake( - uploads_to_update, mark_secure_in_loop_because_no_login_required - ) - - if !SiteSetting.login_required? - update_specific_upload_security_no_login_required(upload_ids_to_mark_as_secure, upload_ids_to_mark_as_not_secure) - else - mark_all_as_secure_login_required(uploads_to_update) + # Otherwise only mark uploads linked to posts in secure categories or PMs as secure. + post_ids_to_rebake, all_upload_ids_changed = update_specific_upload_security_no_login_required end end # Enqueue rebakes AFTER upload transaction complete, so there is no race condition # between updating the DB and the rebakes occurring. - post_rebake_errors = rebake_upload_posts(posts_to_rebake) + post_rebake_errors = rebake_upload_posts(post_ids_to_rebake) log_rebake_errors(post_rebake_errors) # Also do this AFTER upload transaction complete so we don't end up with any # errors leaving ACLs in a bad state (the ACL sync task can be run to fix any # outliers at any time). - adjust_acls(uploads_to_adjust_acl_for) + adjust_acls(all_upload_ids_changed) end puts "", "", "Done!" end -def adjust_acls(uploads_to_adjust_acl_for) - total_count = uploads_to_adjust_acl_for.respond_to?(:length) ? uploads_to_adjust_acl_for.length : uploads_to_adjust_acl_for.count - puts "", "Updating ACL for #{total_count} uploads.", "" - i = 0 - uploads_to_adjust_acl_for.each do |upload| - RakeHelpers.print_status_with_label("Updating ACL for upload.......", i, total_count) - Discourse.store.update_upload_ACL(upload) - i += 1 +def adjust_acls(upload_ids_to_adjust_acl_for) + jobs_to_create = (upload_ids_to_adjust_acl_for.count.to_f / 100.00).ceil + + if jobs_to_create > 1 + puts "Adjusting ACLs for #{upload_ids_to_adjust_acl_for} uploads. These will be batched across #{jobs_to_create} sync job(s)." + end + + upload_ids_to_adjust_acl_for.each_slice(100) do |upload_ids| + Jobs.enqueue(:sync_acls_for_uploads, upload_ids: upload_ids) + end + + if jobs_to_create > 1 + puts "ACL batching complete. Keep an eye on the Sidekiq queue for progress." end - RakeHelpers.print_status_with_label("Updating ACLs complete! ", i, total_count) end -def mark_all_as_secure_login_required(uploads_to_update) - puts "Marking #{uploads_to_update.count} upload(s) as secure because login_required is true.", "" - uploads_to_update.update_all( - secure: true, - security_last_changed_at: Time.zone.now, - security_last_changed_reason: "upload security rake task all secure login required" - ) +def mark_all_as_secure_login_required + post_upload_ids_marked_secure = DB.query_single(<<~SQL) + WITH upl AS ( + SELECT DISTINCT ON (upload_id) upload_id + FROM post_uploads + INNER JOIN posts ON posts.id = post_uploads.post_id + INNER JOIN topics ON topics.id = posts.topic_id + ) + UPDATE uploads + SET secure = true, + security_last_changed_reason = 'upload security rake task mark as secure', + security_last_changed_at = NOW() + FROM upl + WHERE uploads.id = upl.upload_id AND NOT uploads.secure + RETURNING uploads.id + SQL + puts "Marked #{post_upload_ids_marked_secure.count} upload(s) as secure because login_required is true.", "" + upload_ids_marked_not_secure = DB.query_single(<<~SQL, post_upload_ids_marked_secure) + UPDATE uploads + SET secure = false, + security_last_changed_reason = 'upload security rake task mark as not secure', + security_last_changed_at = NOW() + WHERE id NOT IN (?) AND uploads.secure + RETURNING uploads.id + SQL + puts "Marked #{upload_ids_marked_not_secure.count} upload(s) as not secure because they are not linked to posts.", "" puts "Finished marking upload(s) as secure." + + post_ids_to_rebake = DB.query_single( + "SELECT DISTINCT post_id FROM post_uploads WHERE upload_id IN (?)", post_upload_ids_marked_secure + ) + [post_ids_to_rebake, (post_upload_ids_marked_secure + upload_ids_marked_not_secure).uniq] end def log_rebake_errors(rebake_errors) @@ -666,45 +659,81 @@ def log_rebake_errors(rebake_errors) end end -def update_specific_upload_security_no_login_required(upload_ids_to_mark_as_secure, upload_ids_to_mark_as_not_secure) - if upload_ids_to_mark_as_secure.any? - puts "Marking #{upload_ids_to_mark_as_secure.length} uploads as secure because UploadSecurity determined them to be secure." - Upload.where(id: upload_ids_to_mark_as_secure).update_all( - secure: true, - security_last_changed_at: Time.zone.now, - security_last_changed_reason: "upload security rake task mark as secure" +def update_specific_upload_security_no_login_required + # A simplification of the rules found in UploadSecurity which is a lot faster than + # having to loop through records and use that class to check security. + post_upload_ids_marked_secure = DB.query_single(<<~SQL) + WITH upl AS ( + SELECT DISTINCT ON (upload_id) upload_id + FROM post_uploads + INNER JOIN posts ON posts.id = post_uploads.post_id + INNER JOIN topics ON topics.id = posts.topic_id + LEFT JOIN categories ON categories.id = topics.category_id + WHERE (topics.category_id IS NOT NULL AND categories.read_restricted) OR + (topics.archetype = 'private_message') ) - end - if upload_ids_to_mark_as_not_secure.any? - puts "Marking #{upload_ids_to_mark_as_not_secure.length} uploads as not secure because UploadSecurity determined them to be not secure." - Upload.where(id: upload_ids_to_mark_as_not_secure).update_all( - secure: false, - security_last_changed_at: Time.zone.now, - security_last_changed_reason: "upload security rake task mark as not secure" + UPDATE uploads + SET secure = true, + security_last_changed_reason = 'upload security rake task mark as secure', + security_last_changed_at = NOW() + FROM upl + WHERE uploads.id = upl.upload_id AND NOT uploads.secure + RETURNING uploads.id + SQL + puts "Marked #{post_upload_ids_marked_secure.length} uploads as secure." + + # Anything in a public category or a regular topic should not be secure. + post_upload_ids_marked_not_secure = DB.query_single(<<~SQL) + WITH upl AS ( + SELECT DISTINCT ON (upload_id) upload_id + FROM post_uploads + INNER JOIN posts ON posts.id = post_uploads.post_id + INNER JOIN topics ON topics.id = posts.topic_id + LEFT JOIN categories ON categories.id = topics.category_id + WHERE (topics.archetype = 'regular' AND topics.category_id IS NOT NULL AND NOT categories.read_restricted) OR + (topics.archetype = 'regular' AND topics.category_id IS NULL) ) - end - puts "Finished updating upload security." + UPDATE uploads + SET secure = false, + security_last_changed_reason = 'upload security rake task mark as not secure', + security_last_changed_at = NOW() + FROM upl + WHERE uploads.id = upl.upload_id AND uploads.secure + RETURNING uploads.id + SQL + puts "Marked #{post_upload_ids_marked_not_secure.length} uploads as not secure." + + # Everything else should not be secure! + upload_ids_changed = (post_upload_ids_marked_secure + post_upload_ids_marked_not_secure).uniq + upload_ids_marked_not_secure = DB.query_single(<<~SQL, upload_ids_changed) + UPDATE uploads + SET secure = false, + security_last_changed_reason = 'upload security rake task mark as not secure', + security_last_changed_at = NOW() + WHERE id NOT IN (?) AND uploads.secure + RETURNING uploads.id + SQL + puts "Finished updating upload security. Marked #{upload_ids_marked_not_secure.length} uploads not linked to posts as not secure." + + all_upload_ids_changed = (upload_ids_changed + upload_ids_marked_not_secure).uniq + post_ids_to_rebake = DB.query_single("SELECT DISTINCT post_id FROM post_uploads WHERE upload_id IN (?)", upload_ids_changed) + [post_ids_to_rebake, all_upload_ids_changed] end def update_uploads_access_control_post - access_control_post_updates = [] - uploads_with_post_ids = DB.query(<<-SQL - SELECT upload_id, ( - SELECT string_agg(CAST(post_uploads.post_id AS varchar), ',' ORDER BY post_uploads.id) as post_ids - FROM post_uploads - WHERE pu.upload_id = post_uploads.upload_id - ) FROM post_uploads pu + DB.exec(<<~SQL) + WITH upl AS ( + SELECT DISTINCT ON (upload_id) upload_id, post_id FROM post_uploads ORDER BY upload_id, post_id + ) + UPDATE uploads + SET access_control_post_id = upl.post_id + FROM upl + WHERE uploads.id = upl.upload_id SQL - ) - uploads_with_post_ids.each do |row| - first_post_id = row.post_ids.split(",").first.to_i - access_control_post_updates << "UPDATE uploads SET access_control_post_id = #{first_post_id} WHERE id = #{row.upload_id};" - end - DB.exec(access_control_post_updates.join("\n")) end -def rebake_upload_posts(posts_to_rebake) - posts_to_rebake = posts_to_rebake.values +def rebake_upload_posts(post_ids_to_rebake) + posts_to_rebake = Post.where(id: post_ids_to_rebake) post_rebake_errors = [] puts "", "Rebaking #{posts_to_rebake.length} posts with affected uploads.", "" begin @@ -723,69 +752,6 @@ def rebake_upload_posts(posts_to_rebake) post_rebake_errors end -def determine_upload_security_and_posts_to_rebake(uploads_to_update, mark_secure_in_loop_because_no_login_required) - upload_ids_to_mark_as_secure = [] - upload_ids_to_mark_as_not_secure = [] - uploads_to_adjust_acl_for = [] - posts_to_rebake = {} - - # we do this to avoid a heavier post query, and to make sure we only - # get unique posts AND include deleted posts (unscoped) - unique_access_control_posts = {} - Post.unscoped.select(:id, :topic_id) - .includes(topic: :category) - .where(id: uploads_to_update.pluck(:access_control_post_id).uniq).find_each do |post| - unique_access_control_posts[post.id] = post - end - - i = 0 - uploads_to_update.find_each do |upload_to_update| - - # fetch the post out of the already populated map to avoid n1s - upload_to_update.access_control_post = unique_access_control_posts[upload_to_update.access_control_post_id] - - # we just need to determine the post security here so the ACL is set to the correct thing, - # because the update_upload_ACL method uses upload.secure? - original_update_secure_status = upload_to_update.secure - upload_to_update.secure = UploadSecurity.new(upload_to_update).should_be_secure? - - # no point changing ACLs or rebaking or doing any such shenanigans - # when the secure status hasn't even changed! - if original_update_secure_status == upload_to_update.secure - i += 1 - next - end - - # we only want to update the acl later once the secure status - # has been saved in the DB; otherwise if there is a later failure - # we get stuck with an incorrect ACL in S3 - uploads_to_adjust_acl_for << upload_to_update - RakeHelpers.print_status_with_label("Analysing which upload posts to rebake.....", i, uploads_to_update.count) - upload_to_update.posts.each do |post| - # don't want unnecessary double-ups - next if posts_to_rebake.key?(post.id) - posts_to_rebake[post.id] = post - end - - # some uploads will be marked as not secure here. - # we need to address this with upload_ids_to_mark_as_not_secure - # e.g. turning off SiteSetting.login_required - if mark_secure_in_loop_because_no_login_required - if upload_to_update.secure? - upload_ids_to_mark_as_secure << upload_to_update.id - else - upload_ids_to_mark_as_not_secure << upload_to_update.id - end - end - - i += 1 - end - RakeHelpers.print_status_with_label("Analysis complete! ", i, uploads_to_update.count) - puts "" - - [upload_ids_to_mark_as_secure, upload_ids_to_mark_as_not_secure, posts_to_rebake, uploads_to_adjust_acl_for] -end - def inline_uploads(post) replaced = false diff --git a/lib/upload_security.rb b/lib/upload_security.rb index cac17d3bba..6eead29250 100644 --- a/lib/upload_security.rb +++ b/lib/upload_security.rb @@ -15,6 +15,11 @@ # on the current secure? status, otherwise there would be a lot of additional # complex queries and joins to perform. Over time more of these specific # queries will be implemented. +# +# NOTE: When updating this to add more cases where uploads will be marked +# secure, consider uploads:secure_upload_analyse_and_update as well, which +# does not use this class directly but uses an SQL version of its rules for +# efficient updating of many uploads in bulk. class UploadSecurity @@custom_public_types = [] diff --git a/spec/jobs/sync_acls_for_uploads_spec.rb b/spec/jobs/sync_acls_for_uploads_spec.rb index a9f9017c3e..c0c07b3f5d 100644 --- a/spec/jobs/sync_acls_for_uploads_spec.rb +++ b/spec/jobs/sync_acls_for_uploads_spec.rb @@ -27,5 +27,11 @@ describe Jobs::SyncAclsForUploads do Discourse.store.expects(:update_upload_ACL).times(3) run_job end + + it "handles updates throwing an exception" do + Discourse.store.expects(:update_upload_ACL).raises(StandardError).then.returns(true, true).times(3) + Discourse.expects(:warn_exception).once + run_job + end end end diff --git a/spec/tasks/uploads_spec.rb b/spec/tasks/uploads_spec.rb index 5b42663ebb..1fda86984e 100644 --- a/spec/tasks/uploads_spec.rb +++ b/spec/tasks/uploads_spec.rb @@ -65,6 +65,24 @@ RSpec.describe "tasks/uploads" do expect(upload3.reload.access_control_post).to eq(post3) end + it "sets everything attached to a post as secure and rebakes all those posts if login is required" do + SiteSetting.login_required = true + freeze_time + + post1.update_columns(baked_at: 1.week.ago) + post2.update_columns(baked_at: 1.week.ago) + post3.update_columns(baked_at: 1.week.ago) + + invoke_task + + expect(post1.reload.baked_at).not_to eq_time(1.week.ago) + expect(post2.reload.baked_at).not_to eq_time(1.week.ago) + expect(post3.reload.baked_at).not_to eq_time(1.week.ago) + expect(upload2.reload.secure).to eq(true) + expect(upload1.reload.secure).to eq(true) + expect(upload3.reload.secure).to eq(true) + end + it "sets the uploads that are media and attachments in the read restricted topic category to secure" do post3.topic.update(category: Fabricate(:private_category, group: Fabricate(:group))) invoke_task @@ -192,8 +210,12 @@ RSpec.describe "tasks/uploads" do end it "updates the affected ACLs" do - FileStore::S3Store.any_instance.expects(:update_upload_ACL).times(4) - invoke_task + expect_enqueued_with( + job: :sync_acls_for_uploads, + args: { upload_ids: [upload1.id, upload2.id, upload3.id, upload4.id] }, + ) do + invoke_task + end end end end