Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion apps/commons/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import itertools
import re
import uuid
from typing import Optional
from io import BytesIO
from typing import List, Optional, Tuple

import pymupdf
from bs4 import BeautifulSoup
from django.conf import settings
from django.contrib.postgres.fields import ArrayField
Expand Down Expand Up @@ -137,3 +139,27 @@ def map_action_to_permission(action: str, codename: str) -> Optional[str]:
"partial_update": f"change_{codename}",
"destroy": f"delete_{codename}",
}.get(action, None)


def extract_text_from_pdf_page(pdf_data: pymupdf.Document, page: int) -> str:
    """Return the text of one page of an opened PDF document.

    Parameters
    ----------
    pdf_data : pymupdf.Document
        The opened PDF document.
    page : int
        Index of the page, used as ``pdf_data[page]``.

    Returns
    -------
    str
        The result of calling ``get_text()`` on the selected page.
    """
    selected_page = pdf_data[page]
    return selected_page.get_text()


def extract_images_from_pdf_page(
    pdf_data: pymupdf.Document, page: int
) -> List[BytesIO]:
    """Return the images referenced by one page as in-memory byte streams.

    Parameters
    ----------
    pdf_data : pymupdf.Document
        The opened PDF document.
    page : int
        Index of the page, used as ``pdf_data[page]``.

    Returns
    -------
    List[BytesIO]
        One stream per entry reported by the page's ``get_images()``, in the
        same order. Empty when the page reports no images.
    """
    # A falsy result from ``get_images()`` is treated as "no images".
    image_entries = pdf_data[page].get_images() or []
    # The first item of each entry is handed to ``extract_image``
    # (presumably the image's xref -- confirm against the PyMuPDF docs);
    # the raw bytes are read from the "image" key of what it returns.
    return [
        BytesIO(pdf_data.extract_image(entry[0]).get("image"))
        for entry in image_entries
    ]


def extract_pdf_data(pdf_data: pymupdf.Document) -> Tuple[str, List[BytesIO]]:
    """Extract the text and the images of every page of a PDF document.

    Parameters
    ----------
    pdf_data : pymupdf.Document
        The opened PDF document.

    Returns
    -------
    Tuple[str, List[BytesIO]]
        The text of all pages concatenated in page order (no separator is
        inserted between pages), and the images of all pages in page order.
    """
    images: List[BytesIO] = []
    page_texts: List[str] = []
    for page in range(pdf_data.page_count):
        images.extend(extract_images_from_pdf_page(pdf_data, page))
        page_texts.append(extract_text_from_pdf_page(pdf_data, page))
    # Join once at the end instead of growing a string with ``+=`` per page.
    return "".join(page_texts), images
34 changes: 34 additions & 0 deletions apps/projects/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from typing import Any, Dict, Tuple, TypeVar

import pymupdf
from rest_framework import serializers
from rest_framework.utils import model_meta

from apps.commons.utils import extract_pdf_data
from apps.organizations.models import Organization
from services.mistral.interface import MistralService

from .models import Project

Expand Down Expand Up @@ -61,3 +64,34 @@ def compute_project_changes(
changes[attr] = (old, new)

return changes


def create_project_from_pdf(pdf_data: pymupdf.Document) -> Dict[str, Any]:
    """Ask the Mistral chat model for project data based on a PDF's text.

    The text of the PDF is extracted and sent as the user prompt, together
    with system instructions asking for a JSON object with ``title`` and
    ``description`` keys.

    NOTE(review): despite its name, this function does not build or save a
    ``Project``; it returns the parsed JSON response as-is. Confirm whether
    creating the instance is meant to happen here or in the caller.

    Parameters
    ----------
    pdf_data : pymupdf.Document
        The PDF file whose text is turned into project data.

    Returns
    -------
    Dict[str, Any]
        The value returned by ``MistralService.get_json_chat_response``. The
        prompt requests ``title`` and ``description`` keys, but the response
        is not validated here.
    """
    # Images are extracted by the helper but are not used yet.
    text, _images = extract_pdf_data(pdf_data)
    system = [
        "CONTEXT : Our user provides a PDF file with the following extracted text.",
        "OBJECTIVE : We want to turn that text into a project's description.",
        "STYLE: Similar to the one used in the original text.",
        "LANGUAGE: Same as the original text.",
        "AUDIENCE : People that don't know the project and will want to learn about it.",
        # Leading and trailing newlines are part of the original instruction.
        (
            "\n"
            "RESPONSE : A json object with the following keys:\n"
            "- title (str): The title of the project.\n"
            "- description (str): The description of the project.\n"
        ),
        "IMPORTANT : DO NOT MAKE UP ANY FACTS, EVEN IF IT MEANS RETURNING VERY LITTLE INFORMATION.",
    ]
    prompt = [text]
    return MistralService.get_json_chat_response(system=system, prompt=prompt)
380 changes: 222 additions & 158 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ parameterized = "^0.9.0"
django-prometheus = "^2.3.1"
pymediawiki = "^0.7.4"
pgvector = "^0.2.4"
mistralai = "^0.0.12"
mistralai = "^1.1.0"
setuptools = "^74.1.1"
pymupdf = "^1.24.10"

[tool.poetry.dev-dependencies]
bandit = "^1.7.3"
Expand Down
38 changes: 28 additions & 10 deletions services/mistral/interface.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,50 @@
from typing import List
import json
from typing import Any, Dict, List

from django.conf import settings
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
from mistralai import Mistral


class MistralService:
    """Class-level wrapper around the Mistral client for chat and embeddings."""

    # Single client instance, created when the class body is executed.
    service = Mistral(api_key=settings.MISTRAL_API_KEY)

    @staticmethod
    def _build_messages(system: List[str], prompt: List[str]) -> List[Dict[str, str]]:
        """Build role-tagged message dicts: all system messages, then all user ones."""
        return [
            *[{"content": message, "role": "system"} for message in system],
            *[{"content": message, "role": "user"} for message in prompt],
        ]

    @classmethod
    def get_chat_response(cls, system: List[str], prompt: List[str], **kwargs) -> str:
        """
        Get the chat response from Mistral API.

        ``system`` and ``prompt`` are sent as system and user messages
        respectively; extra keyword arguments are forwarded to the client.
        The contents of all returned choices are joined with newlines.
        """
        response = cls.service.chat.complete(
            model="mistral-small",
            messages=cls._build_messages(system, prompt),
            **kwargs,
        )
        return "\n".join([choice.message.content for choice in response.choices])

    @classmethod
    def get_json_chat_response(
        cls, system: List[str], prompt: List[str], **kwargs
    ) -> Dict[str, Any]:
        """
        Get the chat response from Mistral API, parsed as JSON.

        The request sets ``response_format`` to ``{"type": "json_object"}``.
        Only the first choice is parsed; ``json.loads`` raises
        ``json.JSONDecodeError`` if its content is not valid JSON.
        """
        response = cls.service.chat.complete(
            model="mistral-small",
            messages=cls._build_messages(system, prompt),
            response_format={"type": "json_object"},
            **kwargs,
        )
        return json.loads(response.choices[0].message.content)

    @classmethod
    def get_embedding(cls, prompt: str) -> List[float]:
        """
        Get the prompt's vector in 1024 dimensions from Mistral API.
        """
        response = cls.service.embeddings.create(
            model="mistral-embed",
            inputs=[prompt],
        )
        # A single input was sent, so the first data item is its embedding.
        return response.data[0].embedding
18 changes: 9 additions & 9 deletions services/mistral/testcases.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
from typing import List

from faker import Faker
from mistralai.models.chat_completion import (
from mistralai.models import (
AssistantMessage,
ChatCompletionChoice,
ChatCompletionResponse,
ChatCompletionResponseChoice,
ChatMessage,
FinishReason,
EmbeddingResponse,
EmbeddingResponseData,
UsageInfo,
)
from mistralai.models.embeddings import EmbeddingObject, EmbeddingResponse

faker = Faker()

Expand All @@ -22,13 +22,13 @@ def chat_response_mocked_return(self, messages: List[str]):
created=int(datetime.now().timestamp()),
model="mistral-small",
choices=[
ChatCompletionResponseChoice(
ChatCompletionChoice(
index=0,
message=ChatMessage(
message=AssistantMessage(
role="assistant",
content=message,
),
finish_reason=FinishReason.stop,
finish_reason="stop",
)
for message in messages
],
Expand All @@ -45,7 +45,7 @@ def embedding_response_mocked_return(self, embedding: List[float]):
object="list",
model="mistral-embed",
data=[
EmbeddingObject(
EmbeddingResponseData(
object="embedding",
embedding=embedding,
index=0,
Expand Down
4 changes: 2 additions & 2 deletions services/mistral/tests/interface/test_mistral_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class MistralServiceTestCase(JwtAPITestCase, MistralTestCaseMixin):
@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.chat.complete")
def test_get_chat_response(self, mocked):
messages = [faker.sentence() for _ in range(3)]
mocked.return_value = self.chat_response_mocked_return(messages)
Expand All @@ -20,7 +20,7 @@ def test_get_chat_response(self, mocked):
)
self.assertEqual(response, "\n".join(messages))

@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_get_embedding(self, mocked):
embedding = [faker.pyfloat(min_value=0, max_value=1) for _ in range(1024)]
mocked.return_value = self.embedding_response_mocked_return(embedding)
Expand Down
12 changes: 6 additions & 6 deletions services/mistral/tests/models/test_project_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ def setUpTestData(cls) -> None:
super().setUpTestData()
cls.organization = OrganizationFactory()

@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_vectorize_with_description(self, mocked_embeddings, mocked_chat):
project = ProjectFactory(
description=faker.text(), organizations=[self.organization]
Expand All @@ -62,8 +62,8 @@ def test_vectorize_with_description(self, mocked_embeddings, mocked_chat):
self.assertEqual(embedding.embedding, vector)
self.assertNotEqual(embedding.prompt_hashcode, "")

@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_vectorize_with_blog_entries(self, mocked_embeddings, mocked_chat):
project = ProjectFactory(description="", organizations=[self.organization])
embedding = ProjectEmbeddingFactory(item=project)
Expand All @@ -77,8 +77,8 @@ def test_vectorize_with_blog_entries(self, mocked_embeddings, mocked_chat):
self.assertEqual(embedding.embedding, vector)
self.assertNotEqual(embedding.prompt_hashcode, "")

@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_vectorize_not_visible(self, mocked_embeddings, mocked_chat):
project = ProjectFactory(description="", organizations=[self.organization])
embedding = ProjectEmbeddingFactory(item=project)
Expand Down
4 changes: 2 additions & 2 deletions services/mistral/tests/models/test_user_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ def setUpTestData(cls) -> None:
super().setUpTestData()
cls.organization = OrganizationFactory()

@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_user_embedding(self, mocked_embeddings, mocked_chat):
project_1 = ProjectFactory(organizations=[self.organization])
project_2 = ProjectFactory(organizations=[self.organization])
Expand Down
16 changes: 8 additions & 8 deletions services/mistral/tests/models/test_user_profile_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def test_set_visibility_not_visible(self):


class VectorizeUserProfileTestCase(JwtAPITestCase, MistralTestCaseMixin):
@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_vectorize_with_personal_description(self, mocked_embeddings, mocked_chat):
user = UserFactory(
personal_description=faker.text(), professional_description=""
Expand All @@ -59,8 +59,8 @@ def test_vectorize_with_personal_description(self, mocked_embeddings, mocked_cha
self.assertEqual(embedding.embedding, vector)
self.assertNotEqual(embedding.prompt_hashcode, "")

@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_vectorize_with_professional_description(
self, mocked_embeddings, mocked_chat
):
Expand All @@ -81,8 +81,8 @@ def test_vectorize_with_professional_description(
self.assertEqual(embedding.embedding, vector)
self.assertNotEqual(embedding.prompt_hashcode, "")

@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_vectorize_with_skills(self, mocked_embeddings, mocked_chat):
user = UserFactory(personal_description="", professional_description="")
embedding = UserProfileEmbeddingFactory(item=user)
Expand All @@ -96,8 +96,8 @@ def test_vectorize_with_skills(self, mocked_embeddings, mocked_chat):
self.assertEqual(embedding.embedding, vector)
self.assertNotEqual(embedding.prompt_hashcode, "")

@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_vectorize_not_visible(self, mocked_embeddings, mocked_chat):
user = UserFactory(personal_description="", professional_description="")
embedding = UserProfileEmbeddingFactory(item=user)
Expand Down
8 changes: 4 additions & 4 deletions services/mistral/tests/views/test_recommended_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,8 @@ def test_user_recommended_random_users(self, role, retrieved_users):
[self.users[user].id for user in retrieved_users[:2]],
)

@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_get_recommended_users_create_embedding_vector(
self, mocked_embeddings, mocked_chat
):
Expand All @@ -277,8 +277,8 @@ def test_get_recommended_users_create_embedding_vector(
[self.users[user].id for user in ["org", "private", "public_2", "public"]],
)

@patch("services.mistral.interface.MistralService.service.chat")
@patch("services.mistral.interface.MistralService.service.embeddings")
@patch("services.mistral.interface.MistralService.service.chat.complete")
@patch("services.mistral.interface.MistralService.service.embeddings.create")
def test_get_recommended_users_create_embedding_object(
self, mocked_embeddings, mocked_chat
):
Expand Down