From 689f494ee15a53672667872c66ba52e9a8cfcf62 Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Thu, 30 May 2024 23:50:37 -0400 Subject: [PATCH 1/6] Add `backfillmoderationdecision` management command --- .../commands/backfillmoderationdecision.py | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 api/api/management/commands/backfillmoderationdecision.py diff --git a/api/api/management/commands/backfillmoderationdecision.py b/api/api/management/commands/backfillmoderationdecision.py new file mode 100644 index 00000000000..d94b768978a --- /dev/null +++ b/api/api/management/commands/backfillmoderationdecision.py @@ -0,0 +1,101 @@ +import argparse + +from django.contrib.auth import get_user_model + +from django_tqdm import BaseCommand + +from api.models import AudioDecision, AudioReport, ImageDecision, ImageReport +from api.models.media import DMCA, MATURE_FILTERED, NO_ACTION, PENDING + + +class Command(BaseCommand): + help = "Back-fill the moderation decision table for a given media type." + + @staticmethod + def add_arguments(parser): + parser.add_argument( + "--dry-run", + help="Count reports to process, and don't do anything else.", + type=bool, + default=True, + action=argparse.BooleanOptionalAction, + ) + parser.add_argument( + "--media-type", + help="The media type to back-fill moderation decisions.", + type=str, + default="image", + choices=["image", "audio"], + ) + parser.add_argument( + "--moderator", + help="The username of the moderator to attribute the decisions to.", + type=str, + default="opener", + ) + + def handle(self, *args, **options): + dry = options["dry_run"] + username = options["moderator"] + media_type = options["media_type"] + + MediaReport = ImageReport + MediaDecision = ImageDecision + if media_type == "audio": + MediaReport = AudioReport + MediaDecision = AudioDecision + + non_pending_reports = MediaReport.objects.filter(decision=None).exclude( + status=PENDING + ) + count_to_process = non_pending_reports.count() + + if dry: + self.info( + f"{count_to_process} {media_type} reports to back-fill. " + f"This is a dry run, exiting without making changes." + ) + return + + if not count_to_process: + self.info("No reports to process.") + return + + t = self.tqdm(total=count_to_process) + + User = get_user_model() + try: + moderator = User.objects.get(username=username) + except User.DoesNotExist: + t.error(f"User '{username}' not found.") + return + + for report in non_pending_reports: + decision = MediaDecision.objects.create( + action=self.get_action(report), + moderator=moderator, + notes="__backfilled_from_report_status", + ) + report.decision = decision + report.save() + t.update(1) + + t.info( + self.style.SUCCESS( + f"Created {count_to_process} {media_type} moderation decisions from existing reports." + ) + ) + + @staticmethod + def get_action(report): + if report.status == MATURE_FILTERED: + return "confirmed_sensitive" + + if report.status == NO_ACTION: + return "rejected_reports" + + # Cases with status = DEINDEXED + if report.reason == DMCA: + return "deindexed_copyright" + + return "deindexed_sensitive" # For reasons MATURE and OTHER From f363f7b99a23b7dae9bd01f9681b0ef8a6089be8 Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Fri, 31 May 2024 16:29:52 -0400 Subject: [PATCH 2/6] Add tests --- api/test/factory/models/audio.py | 4 +- api/test/factory/models/oauth2.py | 6 ++ .../test_backfillmoderationdecision.py | 72 +++++++++++++++++++ 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 api/test/unit/management/commands/test_backfillmoderationdecision.py diff --git a/api/test/factory/models/audio.py b/api/test/factory/models/audio.py index 0004e39f7ef..74f008a9d1e 100644 --- a/api/test/factory/models/audio.py +++ b/api/test/factory/models/audio.py @@ -3,7 +3,7 @@ from api.models.audio import Audio, AudioAddOn, AudioReport, SensitiveAudio from test.factory.faker import Faker -from test.factory.models.media import IdentifierFactory, MediaFactory +from test.factory.models.media import IdentifierFactory, MediaFactory, MediaReportFactory class SensitiveAudioFactory(DjangoModelFactory): @@ -29,7 +29,7 @@ class Meta: waveform_peaks = Faker("waveform") -class AudioReportFactory(DjangoModelFactory): +class AudioReportFactory(MediaReportFactory): class Meta: model = AudioReport diff --git a/api/test/factory/models/oauth2.py b/api/test/factory/models/oauth2.py index d84f912da4a..7d883802c0b 100644 --- a/api/test/factory/models/oauth2.py +++ b/api/test/factory/models/oauth2.py @@ -1,4 +1,5 @@ from django.utils import timezone +from django.contrib.auth import get_user_model import factory from factory.django import DjangoModelFactory @@ -67,3 +68,8 @@ class Meta: tzinfo=timezone.get_current_timezone(), ) application = factory.SubFactory(ThrottledApplicationFactory) + + +class UserFactory(DjangoModelFactory): + class Meta: + model = get_user_model() diff --git a/api/test/unit/management/commands/test_backfillmoderationdecision.py b/api/test/unit/management/commands/test_backfillmoderationdecision.py new file mode 100644 index 00000000000..e92a26692f4 --- /dev/null +++ b/api/test/unit/management/commands/test_backfillmoderationdecision.py @@ -0,0 +1,72 @@ +from io import StringIO +import pytest + +from django.core.management import call_command + +from api.models import AudioDecision, ImageDecision +from api.models import DMCA, MATURE, OTHER, MATURE_FILTERED, NO_ACTION, DEINDEXED +from test.factory.models.audio import AudioReportFactory +from test.factory.models.image import ImageReportFactory +from test.factory.models.oauth2 import UserFactory + + +def call_cmd(**options): + out = StringIO() + err = StringIO() + call_command( + "backfillmoderationdecision", + **options, + stdout=out, + stderr=err, + ) + res = out.getvalue(), err.getvalue() + print(res) + + return res + + +def make_reports(media_type, reason: str, status: str, count: int = 1): + if media_type == "audio": + AudioReportFactory.create_batch(count, status=status, reason=reason) + else: + ImageReportFactory.create_batch(count, status=status, reason=reason) + +@pytest.mark.parametrize( + ("reason", "status", "expected_action"), + ( + (MATURE, MATURE_FILTERED, "confirmed_sensitive"), + (DMCA, MATURE_FILTERED, "confirmed_sensitive"), + (OTHER, MATURE_FILTERED, "confirmed_sensitive"), + (MATURE, NO_ACTION, "rejected_reports"), + (DMCA, NO_ACTION, "rejected_reports"), + (OTHER, NO_ACTION, "rejected_reports"), + (MATURE, DEINDEXED, "deindexed_sensitive"), + (DMCA, DEINDEXED, "deindexed_copyright"), + (OTHER, DEINDEXED, "deindexed_sensitive"), + ) +) +@pytest.mark.parametrize(("media_type"), ("image", "audio")) +@pytest.mark.django_db +def test_create_moderation_decision_for_reports(media_type, reason, status, expected_action): + username = "opener" + UserFactory.create(username=username) + + make_reports(media_type=media_type, reason=reason, status=status) + + out , err = call_cmd(dry_run=False, media_type=media_type, moderator=username) + + MediaDecision = ImageDecision if media_type == "image" else AudioDecision + assert MediaDecision.objects.count() == 1 + assert f"Created 1 {media_type} moderation decisions from existing reports." in out + + decision = MediaDecision.objects.first() + assert decision.action == expected_action + assert decision.moderator.username == username + + +@pytest.mark.django_db +def test_catch_user_exception(): + make_reports(media_type="image", reason=MATURE, status=MATURE_FILTERED) + _, err = call_cmd(dry_run=False, moderator="nonexistent") + + assert "User 'nonexistent' not found." in err \ No newline at end of file From b0c7c11fb0a4c7bd8a81ccd4dfc290d00ef93c5a Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Fri, 31 May 2024 16:51:14 -0400 Subject: [PATCH 3/6] Lint --- api/test/factory/models/audio.py | 6 ++++- api/test/factory/models/oauth2.py | 2 +- .../test_backfillmoderationdecision.py | 26 ++++++++++++++----- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/api/test/factory/models/audio.py b/api/test/factory/models/audio.py index 74f008a9d1e..b5a096082af 100644 --- a/api/test/factory/models/audio.py +++ b/api/test/factory/models/audio.py @@ -3,7 +3,11 @@ from api.models.audio import Audio, AudioAddOn, AudioReport, SensitiveAudio from test.factory.faker import Faker -from test.factory.models.media import IdentifierFactory, MediaFactory, MediaReportFactory +from test.factory.models.media import ( + IdentifierFactory, + MediaFactory, + MediaReportFactory, +) class SensitiveAudioFactory(DjangoModelFactory): diff --git a/api/test/factory/models/oauth2.py b/api/test/factory/models/oauth2.py index 7d883802c0b..28f5eac9e39 100644 --- a/api/test/factory/models/oauth2.py +++ b/api/test/factory/models/oauth2.py @@ -1,5 +1,5 @@ -from django.utils import timezone from django.contrib.auth import get_user_model +from django.utils import timezone import factory from factory.django import DjangoModelFactory diff --git a/api/test/unit/management/commands/test_backfillmoderationdecision.py b/api/test/unit/management/commands/test_backfillmoderationdecision.py index e92a26692f4..f609f1aef2c 100644 --- a/api/test/unit/management/commands/test_backfillmoderationdecision.py +++ b/api/test/unit/management/commands/test_backfillmoderationdecision.py @@ -1,10 +1,19 @@ from io import StringIO -import pytest from django.core.management import call_command -from api.models import AudioDecision, ImageDecision -from api.models import DMCA, MATURE, OTHER, MATURE_FILTERED, NO_ACTION, DEINDEXED +import pytest + +from api.models import ( + DEINDEXED, + DMCA, + MATURE, + MATURE_FILTERED, + NO_ACTION, + OTHER, + AudioDecision, + ImageDecision, +) from test.factory.models.audio import AudioReportFactory from test.factory.models.image import ImageReportFactory from test.factory.models.oauth2 import UserFactory @@ -31,6 +40,7 @@ def make_reports(media_type, reason: str, status: str, count: int = 1): else: ImageReportFactory.create_batch(count, status=status, reason=reason) + @pytest.mark.parametrize( ("reason", "status", "expected_action"), ( @@ -43,17 +53,19 @@ def make_reports(media_type, reason: str, status: str, count: int = 1): (MATURE, DEINDEXED, "deindexed_sensitive"), (DMCA, DEINDEXED, "deindexed_copyright"), (OTHER, DEINDEXED, "deindexed_sensitive"), - ) + ), ) @pytest.mark.parametrize(("media_type"), ("image", "audio")) @pytest.mark.django_db -def test_create_moderation_decision_for_reports(media_type, reason, status, expected_action): +def test_create_moderation_decision_for_reports( + media_type, reason, status, expected_action +): username = "opener" UserFactory.create(username=username) make_reports(media_type=media_type, reason=reason, status=status) - out , err = call_cmd(dry_run=False, media_type=media_type, moderator=username) + out, err = call_cmd(dry_run=False, media_type=media_type, moderator=username) MediaDecision = ImageDecision if media_type == "image" else AudioDecision assert MediaDecision.objects.count() == 1 @@ -69,4 +81,4 @@ def test_catch_user_exception(): make_reports(media_type="image", reason=MATURE, status=MATURE_FILTERED) _, err = call_cmd(dry_run=False, moderator="nonexistent") - assert "User 'nonexistent' not found." in err \ No newline at end of file + assert "User 'nonexistent' not found." in err From 4d97b1223c42f1ea0096f63480dbfdfb448091a8 Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Tue, 4 Jun 2024 13:37:46 -0400 Subject: [PATCH 4/6] Use `bulk_*` methods --- .../commands/backfillmoderationdecision.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/api/api/management/commands/backfillmoderationdecision.py b/api/api/management/commands/backfillmoderationdecision.py index d94b768978a..68557afe27a 100644 --- a/api/api/management/commands/backfillmoderationdecision.py +++ b/api/api/management/commands/backfillmoderationdecision.py @@ -10,6 +10,7 @@ class Command(BaseCommand): help = "Back-fill the moderation decision table for a given media type." + batch_size = 100 @staticmethod def add_arguments(parser): @@ -61,8 +62,7 @@ def handle(self, *args, **options): self.info("No reports to process.") return - t = self.tqdm(total=count_to_process) - + t = self.tqdm(total=count_to_process // self.batch_size) User = get_user_model() try: moderator = User.objects.get(username=username) @@ -70,14 +70,18 @@ def handle(self, *args, **options): t.error(f"User '{username}' not found.") return - for report in non_pending_reports: - decision = MediaDecision.objects.create( - action=self.get_action(report), - moderator=moderator, - notes="__backfilled_from_report_status", + while reports_chunk := non_pending_reports[: self.batch_size]: + decisions = MediaDecision.objects.bulk_create( + MediaDecision( + action=self.get_action(report), + moderator=moderator, + notes="__backfilled_from_report_status", + ) + for report in reports_chunk ) - report.decision = decision - report.save() + for report, decision in zip(reports_chunk, decisions): + report.decision = decision + MediaReport.objects.bulk_update(reports_chunk, ["decision"]) t.update(1) t.info( From 65daed2698b6d53c68cf7da09b8758b0dd60ffec Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Tue, 4 Jun 2024 14:16:10 -0400 Subject: [PATCH 5/6] Use `DecisionAction` choices class --- .../commands/backfillmoderationdecision.py | 11 ++++++----- .../test_backfillmoderationdecision.py | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/api/api/management/commands/backfillmoderationdecision.py b/api/api/management/commands/backfillmoderationdecision.py index 68557afe27a..f88e40769ab 100644 --- a/api/api/management/commands/backfillmoderationdecision.py +++ b/api/api/management/commands/backfillmoderationdecision.py @@ -4,13 +4,14 @@ from django_tqdm import BaseCommand +from api.constants.moderation import DecisionAction from api.models import AudioDecision, AudioReport, ImageDecision, ImageReport from api.models.media import DMCA, MATURE_FILTERED, NO_ACTION, PENDING class Command(BaseCommand): help = "Back-fill the moderation decision table for a given media type." - batch_size = 100 + batch_size = 3 @staticmethod def add_arguments(parser): @@ -93,13 +94,13 @@ def handle(self, *args, **options): @staticmethod def get_action(report): if report.status == MATURE_FILTERED: - return "confirmed_sensitive" + return DecisionAction.MARKED_SENSITIVE if report.status == NO_ACTION: - return "rejected_reports" + return DecisionAction.REJECTED_REPORTS # Cases with status = DEINDEXED if report.reason == DMCA: - return "deindexed_copyright" + return DecisionAction.DEINDEXED_COPYRIGHT - return "deindexed_sensitive" # For reasons MATURE and OTHER + return DecisionAction.DEINDEXED_SENSITIVE # For reasons MATURE and OTHER diff --git a/api/test/unit/management/commands/test_backfillmoderationdecision.py b/api/test/unit/management/commands/test_backfillmoderationdecision.py index f609f1aef2c..f6c18575392 100644 --- a/api/test/unit/management/commands/test_backfillmoderationdecision.py +++ b/api/test/unit/management/commands/test_backfillmoderationdecision.py @@ -4,6 +4,7 @@ import pytest +from api.constants.moderation import DecisionAction from api.models import ( DEINDEXED, DMCA, @@ -44,15 +45,15 @@ def make_reports(media_type, reason: str, status: str, count: int = 1): @pytest.mark.parametrize( ("reason", "status", "expected_action"), ( - (MATURE, MATURE_FILTERED, "confirmed_sensitive"), - (DMCA, MATURE_FILTERED, "confirmed_sensitive"), - (OTHER, MATURE_FILTERED, "confirmed_sensitive"), - (MATURE, NO_ACTION, "rejected_reports"), - (DMCA, NO_ACTION, "rejected_reports"), - (OTHER, NO_ACTION, "rejected_reports"), - (MATURE, DEINDEXED, "deindexed_sensitive"), - (DMCA, DEINDEXED, "deindexed_copyright"), - (OTHER, DEINDEXED, "deindexed_sensitive"), + (MATURE, MATURE_FILTERED, DecisionAction.MARKED_SENSITIVE), + (DMCA, MATURE_FILTERED, DecisionAction.MARKED_SENSITIVE), + (OTHER, MATURE_FILTERED, DecisionAction.MARKED_SENSITIVE), + (MATURE, NO_ACTION, DecisionAction.REJECTED_REPORTS), + (DMCA, NO_ACTION, DecisionAction.REJECTED_REPORTS), + (OTHER, NO_ACTION, DecisionAction.REJECTED_REPORTS), + (MATURE, DEINDEXED, DecisionAction.DEINDEXED_SENSITIVE), + (DMCA, DEINDEXED, DecisionAction.DEINDEXED_COPYRIGHT), + (OTHER, DEINDEXED, DecisionAction.DEINDEXED_SENSITIVE), ), ) @pytest.mark.parametrize(("media_type"), ("image", "audio")) From df449e4a33ec6d22b0b73c426adb5b80e05aec8d Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Wed, 5 Jun 2024 17:20:39 -0400 Subject: [PATCH 6/6] Create `MediaDecisionThrough` rows too --- .../commands/backfillmoderationdecision.py | 17 ++++++++++++++++- .../commands/test_backfillmoderationdecision.py | 16 +++++++++++++--- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/api/api/management/commands/backfillmoderationdecision.py b/api/api/management/commands/backfillmoderationdecision.py index f88e40769ab..2b3598a3a66 100644 --- a/api/api/management/commands/backfillmoderationdecision.py +++ b/api/api/management/commands/backfillmoderationdecision.py @@ -5,7 +5,14 @@ from django_tqdm import BaseCommand from api.constants.moderation import DecisionAction -from api.models import AudioDecision, AudioReport, ImageDecision, ImageReport +from api.models import ( + AudioDecision, + AudioDecisionThrough, + AudioReport, + ImageDecision, + ImageDecisionThrough, + ImageReport, +) from api.models.media import DMCA, MATURE_FILTERED, NO_ACTION, PENDING @@ -43,9 +50,11 @@ def handle(self, *args, **options): MediaReport = ImageReport MediaDecision = ImageDecision + MediaDecisionThrough = ImageDecisionThrough if media_type == "audio": MediaReport = AudioReport MediaDecision = AudioDecision + MediaDecisionThrough = AudioDecisionThrough non_pending_reports = MediaReport.objects.filter(decision=None).exclude( status=PENDING @@ -83,6 +92,12 @@ def handle(self, *args, **options): for report, decision in zip(reports_chunk, decisions): report.decision = decision MediaReport.objects.bulk_update(reports_chunk, ["decision"]) + MediaDecisionThrough.objects.bulk_create( + [ + MediaDecisionThrough(media_obj=report.media_obj, decision=decision) + for report, decision in zip(reports_chunk, decisions) + ] + ) t.update(1) t.info( diff --git a/api/test/unit/management/commands/test_backfillmoderationdecision.py b/api/test/unit/management/commands/test_backfillmoderationdecision.py index f6c18575392..9c6e517dd91 100644 --- a/api/test/unit/management/commands/test_backfillmoderationdecision.py +++ b/api/test/unit/management/commands/test_backfillmoderationdecision.py @@ -13,7 +13,9 @@ NO_ACTION, OTHER, AudioDecision, + AudioDecisionThrough, ImageDecision, + ImageDecisionThrough, ) from test.factory.models.audio import AudioReportFactory from test.factory.models.image import ImageReportFactory @@ -37,9 +39,9 @@ def call_cmd(**options): def make_reports(media_type, reason: str, status: str, count: int = 1): if media_type == "audio": - AudioReportFactory.create_batch(count, status=status, reason=reason) + return AudioReportFactory.create_batch(count, status=status, reason=reason) else: - ImageReportFactory.create_batch(count, status=status, reason=reason) + return ImageReportFactory.create_batch(count, status=status, reason=reason) @pytest.mark.parametrize( @@ -64,18 +66,26 @@ def test_create_moderation_decision_for_reports( username = "opener" UserFactory.create(username=username) - make_reports(media_type=media_type, reason=reason, status=status) + report = make_reports(media_type=media_type, reason=reason, status=status)[0] out, err = call_cmd(dry_run=False, media_type=media_type, moderator=username) MediaDecision = ImageDecision if media_type == "image" else AudioDecision + MediaDecisionThrough = ( + ImageDecisionThrough if media_type == "image" else AudioDecisionThrough + ) assert MediaDecision.objects.count() == 1 assert f"Created 1 {media_type} moderation decisions from existing reports." in out decision = MediaDecision.objects.first() + assert decision.media_objs.count() == 1 assert decision.action == expected_action assert decision.moderator.username == username + decision_through = MediaDecisionThrough.objects.first() + assert decision_through.media_obj == report.media_obj + assert decision_through.decision == decision + @pytest.mark.django_db def test_catch_user_exception():