Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES/+safe-in-parameter-limit.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Avoid exceeding PostgreSQL's 65,535 query parameter limit when filtering by large lists of IDs. This fixes `OperationalError` crashes during large import and copy operations involving more than 65,535 content units.
1 change: 1 addition & 0 deletions CHANGES/plugin_api/+safe-in-parameter-limit.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added `safe_in()` to the plugin API for building `Q` objects that are safe for arbitrarily large value lists, avoiding PostgreSQL's 65,535 query parameter limit.
35 changes: 14 additions & 21 deletions pulpcore/app/models/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
get_prn,
get_view_name_for_model,
reverse,
safe_in,
)
from pulpcore.cache import Cache
from pulpcore.constants import ALL_KNOWN_CONTENT_CHECKSUMS, PROTECTED_REPO_VERSION_MESSAGE
Expand Down Expand Up @@ -988,7 +989,7 @@ def get_content(self, content_qs=None):
Args:
content_qs (django.db.models.QuerySet): The queryset for Content that will be
restricted further to the content present in this repository version. If not given,
``Content.objects.all()`` is used (to return over all content types present in the
`Content.objects.all()` is used (to return over all content types present in the
repository version).

Returns:
Expand All @@ -1004,15 +1005,7 @@ def get_content(self, content_qs=None):
if content_qs is None:
content_qs = Content.objects

content_ids = self.content_ids
if len(content_ids) >= 65535:
# Workaround for PostgreSQL's limit on the number of parameters in a query
content_ids = (
RepositoryVersion.objects.filter(pk=self.pk)
.annotate(cids=Func(F("content_ids"), function="unnest"))
.values_list("cids", flat=True)
)
return content_qs.filter(pk__in=content_ids)
return content_qs.filter(safe_in("pk", self.content_ids))

@property
def content(self):
Expand Down Expand Up @@ -1056,14 +1049,14 @@ def content_batch_qs(self, content_qs=None, order_by_params=("pk",), batch_size=
Args:
content_qs (django.db.models.QuerySet) The queryset for Content that will be
restricted further to the content present in this repository version. If not given,
``Content.objects.all()`` is used (to iterate over all content present in the
`Content.objects.all()` is used (to iterate over all content present in the
repository version). A plugin may want to use a specific subclass of
[pulpcore.plugin.models.Content][] or use e.g. ``filter()`` to select
[pulpcore.plugin.models.Content][] or use e.g. `filter()` to select
a subset of the repository version's content.
order_by_params (tuple of str): The parameters for the ``order_by`` clause
for the content. The Default is ``("pk",)``. This needs to
order_by_params (tuple of str): The parameters for the `order_by` clause
for the content. The Default is `("pk",)`. This needs to
specify a stable order. For example, if you want to iterate by
decreasing creation time stamps use ``("-pulp_created", "pk")`` to
decreasing creation time stamps use `("-pulp_created", "pk")` to
ensure that content records are still sorted by primary key even
if their creation timestamp happens to be equal.
batch_size (int): The maximum batch size.
Expand All @@ -1072,8 +1065,8 @@ def content_batch_qs(self, content_qs=None, order_by_params=("pk",), batch_size=
[django.db.models.QuerySet][]: A QuerySet representing a slice of the content.

Example:
The following code could be used to loop over all ``FileContent`` in
``repository_version``. It prefetches the related
The following code could be used to loop over all `FileContent` in
`repository_version`. It prefetches the related
[pulpcore.plugin.models.ContentArtifact][] instances for every batch::

repository_version = ...
Expand Down Expand Up @@ -1126,8 +1119,8 @@ def added(self, base_version=None):
if not base_version:
return Content.objects.filter(version_memberships__version_added=self)

return Content.objects.filter(pk__in=self.content_ids).exclude(
pk__in=base_version.content_ids
return Content.objects.filter(safe_in("pk", self.content_ids)).exclude(
safe_in("pk", base_version.content_ids)
)

def removed(self, base_version=None):
Expand All @@ -1141,8 +1134,8 @@ def removed(self, base_version=None):
if not base_version:
return Content.objects.filter(version_memberships__version_removed=self)

return Content.objects.filter(pk__in=base_version.content_ids).exclude(
pk__in=self.content_ids
return Content.objects.filter(safe_in("pk", base_version.content_ids)).exclude(
safe_in("pk", self.content_ids)
)

def contains(self, content):
Expand Down
5 changes: 3 additions & 2 deletions pulpcore/app/tasks/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
compute_file_hash,
get_domain,
get_domain_pk,
safe_in,
)
from pulpcore.constants import TASK_STATES
from pulpcore.exceptions.plugin import MissingPlugin
Expand Down Expand Up @@ -410,14 +411,14 @@ def import_repository_version(
for repo_name, content_ids in mapping.items():
repo_name = _get_destination_repo_name(importer, repo_name)
dest_repo = Repository.objects.get(name=repo_name)
content = Content.objects.filter(upstream_id__in=content_ids)
content = Content.objects.filter(safe_in("upstream_id", content_ids))
content_count += len(content_ids)
with dest_repo.new_version() as new_version:
new_version.set_content(content)
else:
# just map all the content to our destination repo
dest_repo = Repository.objects.get(pk=dest_repo_pk)
content = Content.objects.filter(pk__in=resulting_content_ids)
content = Content.objects.filter(safe_in("pk", resulting_content_ids))
content_count += len(resulting_content_ids)
with dest_repo.new_version() as new_version:
new_version.set_content(content)
Expand Down
43 changes: 42 additions & 1 deletion pulpcore/app/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from django.apps import apps
from django.conf import settings
from django.db import connection
from django.db.models import Model, UUIDField
from django.db.models import Field, Lookup, Model, Q, UUIDField
from rest_framework.reverse import reverse as drf_reverse
from rest_framework.serializers import ValidationError

Expand All @@ -26,6 +26,47 @@
from pulpcore.app.loggers import deprecation_logger
from pulpcore.exceptions.validation import InvalidSignatureError

POSTGRES_MAX_QUERY_PARAMS = 65535


class AnyArray(Lookup):
"""PostgreSQL `= ANY(%s)` lookup that passes a list as a single array parameter.

psycopg3 adapts the Python list into a PostgreSQL array, so the entire list
counts as **one** bind parameter regardless of size. This avoids the
protocol-level 65535-parameter limit that `IN ($1, $2, …)` hits.
"""

lookup_name = "any_array"

def get_prep_lookup(self):
return [self.lhs.output_field.get_prep_value(v) for v in self.rhs]

def as_sql(self, compiler, connection):
lhs, lhs_params = self.process_lhs(compiler, connection)
return f"{lhs} = ANY(%s)", lhs_params + [list(self.rhs)]


Field.register_lookup(AnyArray)


def safe_in(field_name, values):
"""Build a `Q` object for `field__in` that is safe for arbitrarily large lists.

* If *values* is already a queryset (or other non-collection type), the
normal `__in` lookup is used — Django turns it into a subquery.
* If the collection has fewer than 65 535 items, `__in` is used as-is.
* Otherwise `__any_array` is used so the whole list travels as a single
PostgreSQL array parameter.
"""
if not isinstance(values, (list, set, tuple, frozenset)):
return Q(**{f"{field_name}__in": values})
values = list(values)
if len(values) < POSTGRES_MAX_QUERY_PARAMS:
return Q(**{f"{field_name}__in": values})
return Q(**{f"{field_name}__any_array": values})

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would there be a downside when we always used this array method?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was something I was wanting to investigate a bit more before undrafting. This is draft because it's pretty much just "what Claude said" and I wanted to at least look at some query plans and compare before going with it.

I was trying to make a reliable unit test as well but, unfortunately, the unit test in #7801 is not reliable for reasons that are not entirely clear to me.

The two SQL strategies are:

IN ($1, $2, ..., $N) — N separate bind parameters, one per value
= ANY($1) — one bind parameter containing a PostgreSQL array
Parsing/planning: With IN, PostgreSQL must parse N parameter placeholders and the planner builds an OR-tree of comparison nodes. At 100K values, that's significant parse time and planner overhead. With = ANY(array), the query structure is always the same single ScalarArrayOpExpr node regardless of list size.

Prepared statement caching: IN produces a different query shape for each different N, so the plan can't be reused across different list sizes. = ANY(array) always has the same shape — one parameter — so the plan is reusable.

Index usage: Both use btree indexes equally well.

Small lists: For tiny lists (1-10 items), IN is marginally cheaper because there's no array construction. The difference is negligible in practice.

So why not always use = ANY?

There's no strong PostgreSQL-level reason not to. The threshold in safe_in() is mostly conservatism:

__in is Django's standard, battle-tested lookup — it handles querysets (subqueries), empty lists, None values, and all the edge cases Django has polished over years. A custom lookup is more code to maintain.
__in works across all database backends. = ANY(array) is PostgreSQL-specific.
For Pulp specifically (always PostgreSQL), you could always use = ANY for Python lists and it'd be fine.
If you want to simplify, you could drop the threshold and always use any_array for concrete lists. The threshold just avoids the custom path when there's no benefit.



# a little cache so viewset_for_model doesn't have to iterate over every app every time
_model_viewset_cache = {}

Expand Down
2 changes: 2 additions & 0 deletions pulpcore/plugin/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
raise_for_unknown_content_units,
resolve_prn,
reverse,
safe_in,
set_current_user,
set_domain,
)
Expand Down Expand Up @@ -59,5 +60,6 @@
"reverse",
"set_current_user",
"resolve_prn",
"safe_in",
"cache_key",
]
3 changes: 3 additions & 0 deletions pulpcore/tests/unit/models/test_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,9 @@ def test_postgresql_parameter_limit(db, repository):
PostgreSQL limits queries to 65535 parameters. This test verifies that content, added(),
and removed() all handle >65535 items correctly.

The safe_in() utility (used internally) avoids this limit by using ``= ANY(%s)``
(a single array parameter) for large value lists.

Queries MUST be evaluated via .iterator() because psycopg3 uses client-side binding for
regular queries (inlining params into the SQL string, bypassing the limit) but server-side
binding for server-side cursors (.iterator()), which enforces the 65,535 parameter cap.
Expand Down
Loading