Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to
### Changed

- ♿️(frontend) Focus main container after navigation #1854
- 🚸(backend) sort user search results by proximity with the active user #1802


### Fixed
Expand Down
1 change: 1 addition & 0 deletions docs/env.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ These are the environment variables you can set for the `impress-backend` contai
| API_USERS_LIST_LIMIT | Limit on API users | 5 |
| API_USERS_LIST_THROTTLE_RATE_BURST | Throttle rate for api on burst | 30/minute |
| API_USERS_LIST_THROTTLE_RATE_SUSTAINED | Throttle rate for api | 180/hour |
| API_USERS_SEARCH_QUERY_MIN_LENGTH | Minimum characters to insert to search a user | 3 |
| AWS_S3_ACCESS_KEY_ID | Access id for s3 endpoint | |
| AWS_S3_ENDPOINT_URL | S3 endpoint | |
| AWS_S3_REGION_NAME | Region name for s3 endpoint | |
Expand Down
5 changes: 4 additions & 1 deletion src/backend/core/api/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import unicodedata

from django.conf import settings
from django.utils.translation import gettext_lazy as _

import django_filters
Expand Down Expand Up @@ -135,4 +136,6 @@ class UserSearchFilter(django_filters.FilterSet):
Custom filter for searching users.
"""

q = django_filters.CharFilter(min_length=5, max_length=254)
q = django_filters.CharFilter(
min_length=settings.API_USERS_SEARCH_QUERY_MIN_LENGTH, max_length=254
)
78 changes: 73 additions & 5 deletions src/backend/core/api/viewsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from csp.decorators import csp_update
from lasuite.malware_detection import malware_detection
from lasuite.oidc_login.decorators import refresh_oidc_access_token
from lasuite.tools.email import get_domain_from_email
from rest_framework import filters, status, viewsets
from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny
Expand All @@ -61,7 +62,11 @@
get_visited_document_ids_of,
)
from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants
from core.utils import (
extract_attachments,
filter_descendants,
users_sharing_documents_with,
)

from . import permissions, serializers, utils
from .filters import DocumentFilter, ListDocumentFilter, UserSearchFilter
Expand Down Expand Up @@ -220,18 +225,80 @@ def get_queryset(self):

# Use trigram similarity for non-email-like queries
# For performance reasons we filter first by similarity, which relies on an
# index, then only calculate precise similarity scores for sorting purposes

return (
# index, then only calculate precise similarity scores for sorting purposes.
#
# Additionally results are reordered to prefer users "closer" to the current
# user: users they recently shared documents with, then same email domain.
# To achieve that without complex SQL, we build a proximity score in Python
# and return the top N results.
# For security results, users that match neither of these proximity criteria
# are not returned at all, to prevent email enumeration.
current_user = self.request.user
shared_map = users_sharing_documents_with(current_user)

user_email_domain = get_domain_from_email(current_user.email) or ""

candidates = list(
queryset.annotate(
sim_email=TrigramSimilarity("email", query),
sim_name=TrigramSimilarity("full_name", query),
)
.annotate(similarity=Greatest("sim_email", "sim_name"))
.filter(similarity__gt=0.2)
.order_by("-similarity")[: settings.API_USERS_LIST_LIMIT]
.order_by("-similarity")
)

# Keep only users that either share documents with the current user
# or have an email with the same domain as the current user.
filtered_candidates = []
for u in candidates:
candidate_domain = get_domain_from_email(u.email) or ""
if shared_map.get(u.id) or (
user_email_domain and candidate_domain == user_email_domain
):
filtered_candidates.append(u)

candidates = filtered_candidates

# Build ordering key for each candidate
def _sort_key(u):
# shared priority: most recent first
# Use shared_last_at timestamp numeric for secondary ordering when shared.
shared_last_at = shared_map.get(u.id)
if shared_last_at:
is_shared = 1
shared_score = int(shared_last_at.timestamp())
else:
is_shared = 0
shared_score = 0

# domain proximity
candidate_email_domain = get_domain_from_email(u.email) or ""

same_full_domain = (
1
if candidate_email_domain
and candidate_email_domain == user_email_domain
else 0
)

# similarity fallback
sim = getattr(u, "similarity", 0) or 0

return (
is_shared,
shared_score,
same_full_domain,
sim,
)

# Sort candidates by the key descending and return top N as a queryset-like
# list. Keep return type consistent with previous behavior (QuerySet slice
# was returned) by returning a list of model instances.
candidates.sort(key=_sort_key, reverse=True)

return candidates[: settings.API_USERS_LIST_LIMIT]

@drf.decorators.action(
detail=False,
methods=["get"],
Expand Down Expand Up @@ -2338,6 +2405,7 @@ def get(self, request):
"""
array_settings = [
"AI_FEATURE_ENABLED",
"API_USERS_SEARCH_QUERY_MIN_LENGTH",
"COLLABORATION_WS_URL",
"COLLABORATION_WS_NOT_CONNECTED_READY_ONLY",
"CONVERSION_FILE_EXTENSIONS_ALLOWED",
Expand Down
22 changes: 20 additions & 2 deletions src/backend/core/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

from functools import partial

from django.core.cache import cache
from django.db import transaction
from django.db.models import signals
from django.dispatch import receiver

from . import models
from .tasks.search import trigger_batch_document_indexer
from core import models
from core.tasks.search import trigger_batch_document_indexer
from core.utils import get_users_sharing_documents_with_cache_key


@receiver(signals.post_save, sender=models.Document)
Expand All @@ -26,8 +28,24 @@ def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-ar
def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument
"""
Asynchronous call to the document indexer at the end of the transaction.
Clear cache for the affected user.
"""
if not created:
transaction.on_commit(
partial(trigger_batch_document_indexer, instance.document)
)

# Invalidate cache for the user
if instance.user:
cache_key = get_users_sharing_documents_with_cache_key(instance.user)
cache.delete(cache_key)


@receiver(signals.post_delete, sender=models.DocumentAccess)
def document_access_post_delete(sender, instance, **kwargs): # pylint: disable=unused-argument
"""
Clear cache for the affected user when document access is deleted.
"""
if instance.user:
cache_key = get_users_sharing_documents_with_cache_key(instance.user)
cache.delete(cache_key)
2 changes: 2 additions & 0 deletions src/backend/core/tests/test_api_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

@override_settings(
AI_FEATURE_ENABLED=False,
API_USERS_SEARCH_QUERY_MIN_LENGTH=6,
COLLABORATION_WS_URL="http://testcollab/",
COLLABORATION_WS_NOT_CONNECTED_READY_ONLY=True,
CRISP_WEBSITE_ID="123",
Expand All @@ -44,6 +45,7 @@ def test_api_config(is_authenticated):
assert response.status_code == HTTP_200_OK
assert response.json() == {
"AI_FEATURE_ENABLED": False,
"API_USERS_SEARCH_QUERY_MIN_LENGTH": 6,
"COLLABORATION_WS_URL": "http://testcollab/",
"COLLABORATION_WS_NOT_CONNECTED_READY_ONLY": True,
"CONVERSION_FILE_EXTENSIONS_ALLOWED": [".docx", ".md"],
Expand Down
106 changes: 83 additions & 23 deletions src/backend/core/tests/test_api_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Test users API endpoints in the impress core app.
"""

from django.utils import timezone

import pytest
from rest_framework.test import APIClient

Expand Down Expand Up @@ -121,12 +123,12 @@ def test_api_users_list_query_full_name():
Authenticated users should be able to list users and filter by full name.
Only results with a Trigram similarity greater than 0.2 with the query should be returned.
"""
user = factories.UserFactory()
user = factories.UserFactory(email="user@example.com")

client = APIClient()
client.force_login(user)

dave = factories.UserFactory(email="contact@work.com", full_name="David Bowman")
dave = factories.UserFactory(email="contact@example.com", full_name="David Bowman")

response = client.get(
"/api/v1.0/users/?q=David",
Expand Down Expand Up @@ -166,13 +168,13 @@ def test_api_users_list_query_accented_full_name():
Authenticated users should be able to list users and filter by full name with accents.
Only results with a Trigram similarity greater than 0.2 with the query should be returned.
"""
user = factories.UserFactory()
user = factories.UserFactory(email="user@example.com")

client = APIClient()
client.force_login(user)

fred = factories.UserFactory(
email="contact@work.com", full_name="Frédérique Lefèvre"
email="contact@example.com", full_name="Frédérique Lefèvre"
)

response = client.get("/api/v1.0/users/?q=Frédérique")
Expand Down Expand Up @@ -201,12 +203,82 @@ def test_api_users_list_query_accented_full_name():
assert users == []


def test_api_users_list_sorted_by_closest_match():
"""
Authenticated users should be able to list users and the results should be
sorted by closest match to the query.

Sorting criteria are :
- Shared documents with the user (most recent first)
- Same full email domain (example.gouv.fr)

Addresses that match neither criteria should be excluded from the results.

Case in point: the logged-in user has recently shared documents
with pierre.dupont@beta.gouv.fr and less recently with pierre.durand@impots.gouv.fr.

Other users named Pierre also exist:
- pierre.thomas@example.com
- pierre.petit@anct.gouv.fr
- pierre.robert@culture.gouv.fr

The search results should be ordered as follows:

# Shared with first
- pierre.dupond@beta.gouv.fr # Most recent first
- pierre.durand@impots.gouv.fr
# Same full domain second
- pierre.petit@anct.gouv.fr
"""

user = factories.UserFactory(
email="martin.bernard@anct.gouv.fr", full_name="Martin Bernard"
)

client = APIClient()
client.force_login(user)

pierre_1 = factories.UserFactory(email="pierre.dupont@beta.gouv.fr")
pierre_2 = factories.UserFactory(email="pierre.durand@impots.gouv.fr")
_pierre_3 = factories.UserFactory(email="pierre.thomas@example.com")
pierre_4 = factories.UserFactory(email="pierre.petit@anct.gouv.fr")
_pierre_5 = factories.UserFactory(email="pierre.robert@culture.gouv.fr")

document_1 = factories.DocumentFactory(creator=user)
document_2 = factories.DocumentFactory(creator=user)
factories.UserDocumentAccessFactory(user=user, document=document_1)
factories.UserDocumentAccessFactory(user=user, document=document_2)

now = timezone.now()
last_week = now - timezone.timedelta(days=7)
last_month = now - timezone.timedelta(days=30)

# The factory cannot set the created_at directly, so we force it after creation
p1_d1 = factories.UserDocumentAccessFactory(user=pierre_1, document=document_1)
p1_d1.created_at = last_week
p1_d1.save()

p2_d2 = factories.UserDocumentAccessFactory(user=pierre_2, document=document_2)
p2_d2.created_at = last_month
p2_d2.save()

response = client.get("/api/v1.0/users/?q=Pierre")
assert response.status_code == 200
user_ids = [user["email"] for user in response.json()]

assert user_ids == [
str(pierre_1.email),
str(pierre_2.email),
str(pierre_4.email),
]


def test_api_users_list_limit(settings):
"""
Authenticated users should be able to list users and the number of results
should be limited to 10.
should be limited to API_USERS_LIST_LIMIT (by default 5).
"""
user = factories.UserFactory()
user = factories.UserFactory(email="user@example.com")

client = APIClient()
client.force_login(user)
Expand Down Expand Up @@ -309,28 +381,16 @@ def test_api_users_list_query_email_exclude_doc_user():

def test_api_users_list_query_short_queries():
"""
Queries shorter than 5 characters should return an empty result set.
If API_USERS_SEARCH_QUERY_MIN_LENGTH is not set, the default minimum length should be 3.
"""
user = factories.UserFactory(email="paul@example.com", full_name="Paul")
client = APIClient()
client.force_login(user)

factories.UserFactory(email="john.doe@example.com")
factories.UserFactory(email="john.lennon@example.com")

response = client.get("/api/v1.0/users/?q=jo")
assert response.status_code == 400
assert response.json() == {
"q": ["Ensure this value has at least 5 characters (it has 2)."]
}

response = client.get("/api/v1.0/users/?q=john")
assert response.status_code == 400
assert response.json() == {
"q": ["Ensure this value has at least 5 characters (it has 4)."]
}
factories.UserFactory(email="john.doe@example.com", full_name="John Doe")
factories.UserFactory(email="john.lennon@example.com", full_name="John Lennon")

response = client.get("/api/v1.0/users/?q=john.")
response = client.get("/api/v1.0/users/?q=joh")
assert response.status_code == 200
assert len(response.json()) == 2

Expand All @@ -356,7 +416,7 @@ def test_api_users_list_query_long_queries():

def test_api_users_list_query_inactive():
"""Inactive users should not be listed."""
user = factories.UserFactory()
user = factories.UserFactory(email="user@example.com")
client = APIClient()
client.force_login(user)

Expand Down
Loading
Loading