-
Notifications
You must be signed in to change notification settings - Fork 4.3k
feat: [AXM-2307] add management command to backfill ContentDate model… #37989
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,271 @@ | ||
| """ | ||
| Django management command to extract assignment dates from modulestore and populate ContentDate table. | ||
| """ | ||
|
|
||
| import logging | ||
| from typing import List | ||
|
|
||
| from django.contrib.auth import get_user_model | ||
| from django.core.management.base import BaseCommand, CommandError | ||
| from django.db import transaction | ||
| from edx_when.api import update_or_create_assignments_due_dates, models as when_models | ||
| from opaque_keys import InvalidKeyError | ||
| from opaque_keys.edx.keys import CourseKey | ||
| from xmodule.modulestore.django import modulestore | ||
| from xmodule.modulestore.exceptions import ItemNotFoundError | ||
|
|
||
| from lms.djangoapps.courseware.courses import get_course_assignments | ||
| from openedx.core.djangoapps.content.course_overviews.models import CourseOverview | ||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
||
| User = get_user_model() | ||
|
|
||
|
|
||
| class Command(BaseCommand): | ||
| """ | ||
| Management command to seed ContentDate table with assignment due dates from modulestore. | ||
|
|
||
| Example usage: | ||
| # Dry run for all courses | ||
| python manage.py lms seed_content_dates --dry-run | ||
|
|
||
| # Seed specific course | ||
| python manage.py lms seed_content_dates --course-id "course-v1:MITx+6.00x+2023_Fall" | ||
|
|
||
| # Seed all courses for specific org | ||
| python manage.py lms seed_content_dates --org "MITx" | ||
|
|
||
| # Force update existing entries | ||
| python manage.py lms seed_content_dates --force-update | ||
| """ | ||
|
|
||
| help = "Extract assignment dates from modulestore and populate ContentDate table" | ||
|
|
||
| def add_arguments(self, parser): | ||
| parser.add_argument( | ||
| "--course-id", | ||
| type=str, | ||
| help='Specific course ID to process (e.g., "course-v1:MITx+6.00x+2023_Fall")', | ||
| ) | ||
| parser.add_argument("--org", type=str, help="Organization to filter courses by") | ||
| parser.add_argument( | ||
| "--dry-run", | ||
| action="store_true", | ||
| help="Show what would be processed without making changes", | ||
| ) | ||
| parser.add_argument( | ||
| "--force-update", | ||
| action="store_true", | ||
| help="Update existing ContentDate entries (default: skip existing)", | ||
| ) | ||
| parser.add_argument( | ||
| "--batch-size", | ||
| type=int, | ||
| default=100, | ||
| help="Number of assignments to process in each batch (default: 100)", | ||
| ) | ||
|
|
||
| def handle(self, *args, **options): | ||
| self.dry_run = options["dry_run"] # pylint: disable=attribute-defined-outside-init | ||
| self.force_update = options["force_update"] # pylint: disable=attribute-defined-outside-init | ||
| self.batch_size = options["batch_size"] # pylint: disable=attribute-defined-outside-init | ||
|
|
||
| if self.dry_run: | ||
| self.stdout.write( | ||
| self.style.WARNING( | ||
| "DRY RUN MODE: No changes will be made to the database" | ||
| ) | ||
| ) | ||
|
|
||
| try: | ||
| course_keys = self._get_course_keys(options) | ||
|
|
||
| total_processed = 0 | ||
| total_created = 0 | ||
| total_updated = 0 | ||
| total_skipped = 0 | ||
|
|
||
| for course_key in course_keys: | ||
| self.stdout.write(f"Processing course: {course_key}") | ||
|
|
||
| try: | ||
| processed, created, updated, skipped = self._process_course( | ||
| course_key | ||
| ) | ||
| total_processed += processed | ||
| total_created += created | ||
| total_updated += updated | ||
| total_skipped += skipped | ||
|
|
||
| self.stdout.write( | ||
| f" Course {course_key}: {processed} assignments processed, " | ||
| f"{created} created, {updated} updated, {skipped} skipped" | ||
| ) | ||
|
|
||
| except Exception as e: # pylint: disable=broad-exception-caught | ||
| self.stdout.write( | ||
| self.style.ERROR( | ||
| f"Error processing course {course_key}: {str(e)}" | ||
| ) | ||
| ) | ||
| log.exception(f"Error processing course {course_key}") | ||
| continue | ||
|
|
||
| self.stdout.write( | ||
| self.style.SUCCESS( | ||
| f"\nSUMMARY:\n" | ||
| f"Total assignments processed: {total_processed}\n" | ||
| f"Total created: {total_created}\n" | ||
| f"Total updated: {total_updated}\n" | ||
| f"Total skipped: {total_skipped}" | ||
| ) | ||
| ) | ||
|
|
||
| except CommandError: | ||
| raise | ||
| except Exception as e: | ||
| raise CommandError(f"Command failed: {str(e)}") from e | ||
|
|
||
| def _get_course_keys(self, options) -> List[CourseKey]: | ||
| """ | ||
| Get list of course keys to process based on command options. | ||
| """ | ||
| course_keys = [] | ||
|
|
||
| if options["course_id"]: | ||
| try: | ||
| course_key = CourseKey.from_string(options["course_id"]) | ||
|
|
||
| if not CourseOverview.objects.filter(id=course_key).exists(): | ||
| raise CommandError(f"Course not found: {options['course_id']}") | ||
| course_keys.append(course_key) | ||
| except InvalidKeyError as e: | ||
| raise CommandError(f"Invalid course ID format: {options['course_id']}") from e | ||
|
|
||
| else: | ||
| queryset = CourseOverview.objects.all() | ||
| if options["org"]: | ||
| queryset = queryset.filter(org=options["org"]) | ||
|
|
||
| course_keys = [overview.id for overview in queryset] | ||
|
|
||
| if not course_keys: | ||
| filter_msg = f" for org '{options['org']}'" if options["org"] else "" | ||
| raise CommandError(f"No courses found{filter_msg}") | ||
|
|
||
| return course_keys | ||
|
|
||
| def _process_course(self, course_key: CourseKey) -> tuple[int, int, int, int]: | ||
| """ | ||
| Process a single course and return (processed, created, updated, skipped) counts. | ||
| """ | ||
| store = modulestore() | ||
|
|
||
| try: | ||
| course = store.get_course(course_key) | ||
| if not course: | ||
| raise ItemNotFoundError( | ||
| f"Course not found in modulestore: {course_key}" | ||
| ) | ||
| except ItemNotFoundError: | ||
| log.warning(f"Course not found in modulestore: {course_key}") | ||
| return (0, 0, 0, 0) | ||
|
|
||
| staff_user = User.objects.filter(is_staff=True).first() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This pattern is unacceptable in my opinion. We need a more robust approach that doesn't pick any old staff user for the convenience of the management command. |
||
| if not staff_user: | ||
| log.warning("No staff user found; cannot load assignments") | ||
| return (0, 0, 0, 0) | ||
| assignments = get_course_assignments(course_key, staff_user) | ||
|
|
||
| if not assignments: | ||
| log.info(f"No assignments with due dates found in course: {course_key}") | ||
| return (0, 0, 0, 0) | ||
|
|
||
| processed = len(assignments) | ||
| created = 0 | ||
| updated = 0 | ||
| skipped = 0 | ||
|
|
||
| if self.dry_run: | ||
| self.stdout.write(f" Would process {processed} assignments") | ||
| for assignment in assignments[:5]: # Show first 5 as preview | ||
| self.stdout.write(f" - {assignment.title} (due: {assignment.date})") | ||
| if len(assignments) > 5: | ||
| self.stdout.write(f" ... and {len(assignments) - 5} more") | ||
| return processed, 0, 0, 0 | ||
|
|
||
| existing_due_locations: set = set( | ||
| when_models.ContentDate.objects.filter( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We shouldn't be reaching in to when's models directly. If there isn't an API that currently supports this, we should create it. |
||
| course_id=course_key, field="due" | ||
| ).values_list("location", flat=True) | ||
| ) | ||
|
|
||
| with transaction.atomic(): | ||
| for i in range(0, len(assignments), self.batch_size): | ||
| batch = assignments[i : i + self.batch_size] | ||
| batch_created, batch_updated, batch_skipped = ( | ||
| self._process_assignment_batch( | ||
| course_key, batch, existing_due_locations | ||
| ) | ||
| ) | ||
| created += batch_created | ||
| updated += batch_updated | ||
| skipped += batch_skipped | ||
|
|
||
| return processed, created, updated, skipped | ||
|
|
||
| def _process_assignment_batch( | ||
| self, | ||
| course_key: CourseKey, | ||
| assignments: list, | ||
| existing_due_locations: set, | ||
| ) -> tuple[int, int, int]: | ||
| """ | ||
| Process a batch of assignments and return (created, updated, skipped) counts. | ||
|
|
||
| existing_due_locations is updated in place when new ContentDate rows are created. | ||
| """ | ||
| created = 0 | ||
| updated = 0 | ||
| skipped = 0 | ||
|
|
||
| for assignment in assignments: | ||
kyrylo-kh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| self.stdout.write( | ||
| f"Processing assignment: {assignment.block_key}/{assignment.assignment_type} (due: {assignment.date})" | ||
| ) | ||
| already_exists = assignment.block_key in existing_due_locations | ||
|
|
||
| if already_exists and not self.force_update: | ||
| skipped += 1 | ||
| log.info( | ||
| f"Skipping existing ContentDate for {assignment.title} " | ||
| f"in course {course_key}" | ||
| ) | ||
| continue | ||
|
|
||
| try: | ||
| update_or_create_assignments_due_dates(course_key, [assignment]) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why aren't we using the batch capability of the imported function rather than running this in a loop with single item lists? |
||
| existing_due_locations.add(assignment.block_key) | ||
|
|
||
| if already_exists: | ||
| updated += 1 | ||
| log.info( | ||
| f"Updated ContentDate for {assignment.title} " | ||
| f"in course {course_key}" | ||
| ) | ||
| else: | ||
| created += 1 | ||
| log.info( | ||
| f"Created ContentDate for {assignment.title} " | ||
| f"in course {course_key}" | ||
| ) | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why this broad catch and continue? I think this is problematic because Django, I believe, will mark the atomic transaction that you created above as invalid and all of the updates will ultimately fail. If the idea is to make this save as much as possible, you will have to change the transaction boundaries. |
||
| except Exception as e: # pylint: disable=broad-exception-caught | ||
| log.error( | ||
| f"Failed to process assignment {assignment.title} " | ||
| f"in course {course_key}: {str(e)}" | ||
| ) | ||
| continue | ||
|
|
||
| return created, updated, skipped | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure of the current performance of this, but passing it as an argument seems like the right thing to do.