-
Notifications
You must be signed in to change notification settings - Fork 382
Bug 1944375 - Improve data cycling/deletion for performance datum replicates #9136
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -79,13 +79,44 @@ def max_timestamp(self): | |
| return self._max_timestamp | ||
|
|
||
| def remove(self, using: CursorWrapper): | ||
| """ | ||
| Raw SQL is used here to avoid Django ORM cascade deletion, which can cause | ||
| database overload on large related tables (e.g. performance_datum_replicate). | ||
| """ | ||
| chunk_size = self._find_ideal_chunk_size() | ||
| deleted, _ = PerformanceDatum.objects.filter( | ||
| id__in=PerformanceDatum.objects.filter( | ||
| push_timestamp__lte=self._max_timestamp | ||
| ).values_list("id")[:chunk_size] | ||
| ).delete() | ||
| using.rowcount = deleted | ||
| using.execute( | ||
| """ | ||
| WITH target_datum AS ( | ||
| SELECT pd.id, pd.push_timestamp | ||
| FROM performance_datum pd | ||
| WHERE pd.push_timestamp <= %s | ||
| ORDER BY pd.push_timestamp | ||
| LIMIT %s | ||
| ), | ||
| del_replicate AS ( | ||
| DELETE FROM performance_datum_replicate r1 | ||
| WHERE r1.performance_datum_id IN ( | ||
| SELECT td.id | ||
| FROM target_datum td | ||
| WHERE td.push_timestamp <= %s | ||
| AND EXISTS ( | ||
| SELECT 1 | ||
| FROM performance_datum_replicate r2 | ||
| WHERE r2.performance_datum_id = td.id | ||
| ) | ||
| ) | ||
| ), | ||
| del_multi AS ( | ||
| DELETE FROM perf_multicommitdatum pm | ||
| USING target_datum td | ||
| WHERE pm.perf_datum_id = td.id | ||
| ) | ||
| DELETE FROM performance_datum pd | ||
| USING target_datum td | ||
| WHERE pd.id = td.id | ||
| """, | ||
| [self._max_timestamp, chunk_size, self._max_timestamp], | ||
| ) | ||
|
|
||
| @property | ||
| def name(self) -> str: | ||
|
|
@@ -175,14 +206,50 @@ def name(self) -> str: | |
| return "try data removal strategy" | ||
|
|
||
| def __attempt_remove(self, using): | ||
| deleted, _ = PerformanceDatum.objects.filter( | ||
| id__in=PerformanceDatum.objects.filter( | ||
| repository_id=self.try_repo, | ||
| push_timestamp__lte=self._max_timestamp, | ||
| signature_id__in=self.target_signatures, | ||
| ).values_list("id")[: self._chunk_size] | ||
| ).delete() | ||
| using.rowcount = deleted | ||
| using.execute( | ||
| """ | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we use f-strings for these instead of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that f-strings can look a bit cleaner and more readable :)
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh excellent point! I missed that we're not using a |
||
| WITH target_datum AS ( | ||
| SELECT pd.id, pd.repository_id, pd.push_timestamp, pd.signature_id | ||
| FROM performance_datum pd | ||
| WHERE pd.repository_id = %s | ||
| AND pd.push_timestamp <= %s | ||
| AND pd.signature_id = ANY(%s) | ||
| LIMIT %s | ||
| ), | ||
| del_replicate AS ( | ||
| DELETE FROM performance_datum_replicate r1 | ||
| WHERE r1.performance_datum_id IN ( | ||
| SELECT td.id | ||
| FROM target_datum td | ||
| WHERE td.repository_id = %s | ||
| AND td.push_timestamp <= %s | ||
| AND td.signature_id = ANY(%s) | ||
| AND EXISTS ( | ||
| SELECT 1 | ||
| FROM performance_datum_replicate r2 | ||
| WHERE r2.performance_datum_id = td.id | ||
| ) | ||
| ) | ||
| ), | ||
| del_multi AS ( | ||
| DELETE FROM perf_multicommitdatum pm | ||
| USING target_datum td | ||
| WHERE pm.perf_datum_id = td.id | ||
| ) | ||
| DELETE FROM performance_datum pd | ||
| USING target_datum td | ||
| WHERE pd.id = td.id | ||
| """, | ||
| [ | ||
| self.try_repo, | ||
| self._max_timestamp, | ||
| list(self.target_signatures), | ||
| self._chunk_size, | ||
| self.try_repo, | ||
| self._max_timestamp, | ||
| list(self.target_signatures), | ||
| ], | ||
| ) | ||
|
|
||
| def __lookup_new_signature(self): | ||
| self.__target_signatures = self.__try_signatures[: self.SIGNATURE_BULK_SIZE] | ||
|
|
@@ -246,12 +313,41 @@ def name(self) -> str: | |
|
|
||
| def remove(self, using: CursorWrapper): | ||
| chunk_size = self._find_ideal_chunk_size() | ||
| deleted, _ = PerformanceDatum.objects.filter( | ||
| id__in=PerformanceDatum.objects.filter( | ||
| repository_id=self.irrelevant_repo, push_timestamp__lte=self._max_timestamp | ||
| ).values_list("id")[:chunk_size] | ||
| ).delete() | ||
| using.rowcount = deleted | ||
| repository_id = self.irrelevant_repo | ||
| using.execute( | ||
| """ | ||
| WITH target_datum AS ( | ||
| SELECT pd.id, pd.repository_id, pd.push_timestamp | ||
| FROM performance_datum pd | ||
| WHERE pd.repository_id = %s | ||
| AND pd.push_timestamp <= %s | ||
| LIMIT %s | ||
| ), | ||
| del_replicate AS ( | ||
| DELETE FROM performance_datum_replicate r1 | ||
| WHERE r1.performance_datum_id IN ( | ||
| SELECT td.id | ||
| FROM target_datum td | ||
| WHERE td.repository_id = %s | ||
| AND td.push_timestamp <= %s | ||
| AND EXISTS ( | ||
| SELECT 1 | ||
| FROM performance_datum_replicate r2 | ||
| WHERE r2.performance_datum_id = td.id | ||
| ) | ||
| ) | ||
| ), | ||
| del_multi AS ( | ||
| DELETE FROM perf_multicommitdatum pm | ||
| USING target_datum td | ||
| WHERE pm.perf_datum_id = td.id | ||
| ) | ||
| DELETE FROM performance_datum pd | ||
| USING target_datum td | ||
| WHERE pd.id = td.id | ||
| """, | ||
| [repository_id, self._max_timestamp, chunk_size, repository_id, self._max_timestamp], | ||
| ) | ||
|
|
||
| def _find_ideal_chunk_size(self) -> int: | ||
| max_id_of_non_expired_row = ( | ||
|
|
@@ -343,14 +439,50 @@ def name(self) -> str: | |
| return "stalled data removal strategy" | ||
|
|
||
| def __attempt_remove(self, using: CursorWrapper): | ||
| deleted, _ = PerformanceDatum.objects.filter( | ||
| id__in=PerformanceDatum.objects.filter( | ||
| repository_id=self.target_signature.repository_id, | ||
| signature_id=self.target_signature.id, | ||
| push_timestamp__lte=self._max_timestamp, | ||
| ).values_list("id")[: self._chunk_size] | ||
| ).delete() | ||
| using.rowcount = deleted | ||
| using.execute( | ||
| """ | ||
| WITH target_datum AS ( | ||
| SELECT pd.id, pd.repository_id, pd.signature_id, pd.push_timestamp | ||
| FROM performance_datum pd | ||
| WHERE pd.repository_id = %s | ||
| AND pd.signature_id = %s | ||
| AND pd.push_timestamp <= %s | ||
| LIMIT %s | ||
| ), | ||
| del_replicate AS ( | ||
| DELETE FROM performance_datum_replicate r1 | ||
| WHERE r1.performance_datum_id IN ( | ||
| SELECT td.id | ||
| FROM target_datum td | ||
| WHERE td.repository_id = %s | ||
| AND td.signature_id = %s | ||
| AND td.push_timestamp <= %s | ||
| AND EXISTS ( | ||
| SELECT 1 | ||
| FROM performance_datum_replicate r2 | ||
| WHERE r2.performance_datum_id = td.id | ||
| ) | ||
| ) | ||
| ), | ||
| del_multi AS ( | ||
| DELETE FROM perf_multicommitdatum pm | ||
| USING target_datum td | ||
| WHERE pm.perf_datum_id = td.id | ||
| ) | ||
| DELETE FROM performance_datum pd | ||
| USING target_datum td | ||
| WHERE pd.id = td.id | ||
| """, | ||
| [ | ||
| self.target_signature.repository_id, | ||
| self.target_signature.id, | ||
| self._max_timestamp, | ||
| self._chunk_size, | ||
| self.target_signature.repository_id, | ||
| self.target_signature.id, | ||
| self._max_timestamp, | ||
| ], | ||
| ) | ||
|
|
||
| def __lookup_new_signature(self): | ||
| try: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add comments somewhere around the execute call to provide an explanation about what each part of the query does? We should do the same for all the other queries.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I’ll add comments around the execute call to explain what each part of the query does, and mirror that for the other queries as well.