Skip to content

Commit d15d609

Browse files
StephanMeijer authored and AntoLC committed
✨(backend) Import of documents
We can now import documents in the .docx and .md formats. To do so, we added a new "docspec" container, which runs the DocSpec service to convert these formats to the BlockNote format. More here: #1567 #1569.
1 parent 1e37007 commit d15d609

File tree

12 files changed

+306
-110
lines changed

12 files changed

+306
-110
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ logs: ## display app-dev logs (follow mode)
213213
.PHONY: logs
214214

215215
run-backend: ## Start only the backend application and all needed services
216+
@$(COMPOSE) up --force-recreate -d docspec
216217
@$(COMPOSE) up --force-recreate -d celery-dev
217218
@$(COMPOSE) up --force-recreate -d y-provider-development
218219
@$(COMPOSE) up --force-recreate -d nginx

compose.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,8 @@ services:
217217
kc_postgresql:
218218
condition: service_healthy
219219
restart: true
220+
221+
docspec:
222+
image: ghcr.io/docspecio/api:2.0.0
223+
ports:
224+
- "4000:4000"

docs/env.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ These are the environment variables you can set for the `impress-backend` contai
103103
| USER_OIDC_ESSENTIAL_CLAIMS | Essential claims in OIDC token | [] |
104104
| Y_PROVIDER_API_BASE_URL | Y Provider url | |
105105
| Y_PROVIDER_API_KEY | Y provider API key | |
106+
| DOCSPEC_API_URL | URL to endpoint of DocSpec conversion API | |
106107

107108

108109
## impress-frontend image

env.d/development/common

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,5 +67,7 @@ DJANGO_SERVER_TO_SERVER_API_TOKENS=server-api-token
6767
Y_PROVIDER_API_BASE_URL=http://y-provider-development:4444/api/
6868
Y_PROVIDER_API_KEY=yprovider-api-key
6969

70+
DOCSPEC_API_URL=http://docspec:4000/conversion
71+
7072
# Theme customization
71-
THEME_CUSTOMIZATION_CACHE_TIMEOUT=15
73+
THEME_CUSTOMIZATION_CACHE_TIMEOUT=15

env.d/development/common.e2e

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ Y_PROVIDER_API_BASE_URL=http://y-provider:4444/api/
66

77
# Throttle
88
API_DOCUMENT_THROTTLE_RATE=1000/min
9-
API_CONFIG_THROTTLE_RATE=1000/min
9+
API_CONFIG_THROTTLE_RATE=1000/min

src/backend/core/api/serializers.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,15 @@
1111
from django.utils.text import slugify
1212
from django.utils.translation import gettext_lazy as _
1313

14+
from core.services import mime_types
1415
import magic
1516
from rest_framework import serializers
1617

1718
from core import choices, enums, models, utils, validators
1819
from core.services.ai_services import AI_ACTIONS
1920
from core.services.converter_services import (
2021
ConversionError,
21-
YdocConverter,
22+
Converter,
2223
)
2324

2425

@@ -188,6 +189,7 @@ class DocumentSerializer(ListDocumentSerializer):
188189

189190
content = serializers.CharField(required=False)
190191
websocket = serializers.BooleanField(required=False, write_only=True)
192+
file = serializers.FileField(required=False, write_only=True, allow_null=True)
191193

192194
class Meta:
193195
model = models.Document
@@ -204,6 +206,7 @@ class Meta:
204206
"deleted_at",
205207
"depth",
206208
"excerpt",
209+
"file",
207210
"is_favorite",
208211
"link_role",
209212
"link_reach",
@@ -461,7 +464,11 @@ def create(self, validated_data):
461464
language = user.language or language
462465

463466
try:
464-
document_content = YdocConverter().convert(validated_data["content"])
467+
document_content = Converter().convert(
468+
validated_data["content"],
469+
mime_types.MARKDOWN,
470+
mime_types.YJS
471+
)
465472
except ConversionError as err:
466473
raise serializers.ValidationError(
467474
{"content": ["Could not convert content"]}

src/backend/core/api/viewsets.py

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,12 @@
4040
from core.services.ai_services import AIService
4141
from core.services.collaboration_services import CollaborationService
4242
from core.services.converter_services import (
43+
ConversionError,
4344
ServiceUnavailableError as YProviderServiceUnavailableError,
44-
)
45-
from core.services.converter_services import (
4645
ValidationError as YProviderValidationError,
46+
Converter,
4747
)
48-
from core.services.converter_services import (
49-
YdocConverter,
50-
)
48+
from core.services import mime_types
5149
from core.tasks.mail import send_ask_for_access_mail
5250
from core.utils import extract_attachments, filter_descendants
5351

@@ -504,6 +502,28 @@ def perform_create(self, serializer):
504502
"IN SHARE ROW EXCLUSIVE MODE;"
505503
)
506504

505+
# Remove file from validated_data as it's not a model field
506+
# Process it if present
507+
uploaded_file = serializer.validated_data.pop("file", None)
508+
509+
# If a file is uploaded, convert it to Yjs format and set as content
510+
if uploaded_file:
511+
try:
512+
file_content = uploaded_file.read()
513+
514+
converter = Converter()
515+
converted_content = converter.convert(
516+
file_content,
517+
content_type=uploaded_file.content_type,
518+
accept=mime_types.YJS
519+
)
520+
serializer.validated_data["content"] = converted_content
521+
serializer.validated_data["title"] = uploaded_file.name
522+
except ConversionError as err:
523+
raise drf.exceptions.ValidationError(
524+
{"file": ["Could not convert file content"]}
525+
) from err
526+
507527
obj = models.Document.add_root(
508528
creator=self.request.user,
509529
**serializer.validated_data,
@@ -1603,14 +1623,14 @@ def content(self, request, pk=None):
16031623
if base64_content is not None:
16041624
# Convert using the y-provider service
16051625
try:
1606-
yprovider = YdocConverter()
1626+
yprovider = Converter()
16071627
result = yprovider.convert(
16081628
base64.b64decode(base64_content),
1609-
"application/vnd.yjs.doc",
1629+
mime_types.YJS,
16101630
{
1611-
"markdown": "text/markdown",
1612-
"html": "text/html",
1613-
"json": "application/json",
1631+
"markdown": mime_types.MARKDOWN,
1632+
"html": mime_types.HTML,
1633+
"json": mime_types.JSON,
16141634
}[content_format],
16151635
)
16161636
content = result

src/backend/core/services/converter_services.py

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
from django.conf import settings
66

77
import requests
8+
import typing
89

10+
from core.services import mime_types
911

1012
class ConversionError(Exception):
1113
"""Base exception for conversion-related errors."""
@@ -19,8 +21,65 @@ class ServiceUnavailableError(ConversionError):
1921
"""Raised when the conversion service is unavailable."""
2022

2123

24+
class ConverterProtocol(typing.Protocol):
    """Structural interface implemented by the conversion backends.

    ``Converter`` annotates both of its backend attributes with this
    protocol, so any object exposing a compatible ``convert`` method can be
    plugged in.
    """

    def convert(self, data, /, content_type, accept):
        """Convert ``data`` from ``content_type`` into the ``accept`` format.

        The payload is positional-only on purpose: the concrete backends do
        not agree on its name (``text`` for one, ``data`` for the other), and
        a positional-only parameter keeps both structurally compatible with
        this protocol.

        Args:
            data: The payload to convert.
            content_type: MIME type describing ``data``.
            accept: MIME type of the requested output.
        """
        ...
26+
27+
28+
class Converter:
    """Facade that routes a conversion request to the right backend service."""

    # Backend handling DOCX -> BlockNote.
    docspec: ConverterProtocol
    # Backend handling every other conversion.
    ydoc: ConverterProtocol

    def __init__(self):
        """Create one client per conversion backend."""
        self.docspec = DocSpecConverter()
        self.ydoc = YdocConverter()

    def convert(self, input, content_type, accept):
        """Convert input into other formats using external microservices.

        DOCX to YJS is done in two hops: the DocSpec backend first produces
        BlockNote, then the result is fed back into this method as a
        BlockNote to YJS conversion. Every other combination is delegated
        unchanged to the YDoc backend.

        Args:
            input: The payload to convert.
            content_type: MIME type describing ``input``.
            accept: MIME type of the requested output.

        Returns:
            Whatever the YDoc backend returns for the final hop.
        """
        needs_docspec = content_type == mime_types.DOCX and accept == mime_types.YJS
        if not needs_docspec:
            return self.ydoc.convert(input, content_type, accept)

        blocknote = self.docspec.convert(
            input, mime_types.DOCX, mime_types.BLOCKNOTE
        )
        # Second hop: BlockNote is not DOCX, so this call reaches the YDoc backend.
        return self.convert(blocknote, mime_types.BLOCKNOTE, mime_types.YJS)
47+
48+
49+
class DocSpecConverter:
    """Client for the DocSpec conversion service (DOCX to BlockNote)."""

    def _request(self, url, data, content_type):
        """POST ``data`` to the DocSpec API and return the HTTP response.

        The payload is sent as an uploaded file in the ``file`` form field,
        always under the name ``document.docx``, and BlockNote is always the
        requested response format. Timeout and TLS verification come from
        the ``CONVERSION_API_TIMEOUT`` and ``CONVERSION_API_SECURE`` settings.

        Raises:
            requests.RequestException: On transport failure, or on an error
                status code reported through ``raise_for_status``.
        """
        upload = {"file": ("document.docx", data, content_type)}
        headers = {"Accept": mime_types.BLOCKNOTE}

        response = requests.post(
            url,
            headers=headers,
            files=upload,
            timeout=settings.CONVERSION_API_TIMEOUT,
            verify=settings.CONVERSION_API_SECURE,
        )
        response.raise_for_status()
        return response

    def convert(self, data, content_type, accept):
        """Convert a DOCX document to BlockNote through the DocSpec API.

        Args:
            data: Content of the document; must not be empty.
            content_type: MIME type of ``data``; only DOCX is accepted.
            accept: Requested output MIME type; only BlockNote is accepted.

        Returns:
            The raw body of the DocSpec response.

        Raises:
            ValidationError: If ``data`` is empty or the requested
                conversion is anything other than DOCX to BlockNote.
            ServiceUnavailableError: If the request to DocSpec fails.
        """
        if not data:
            raise ValidationError("Input data cannot be empty")

        is_supported = (
            content_type == mime_types.DOCX and accept == mime_types.BLOCKNOTE
        )
        if not is_supported:
            raise ValidationError(
                f"Conversion from {content_type} to {accept} is not supported."
            )

        try:
            response = self._request(settings.DOCSPEC_API_URL, data, content_type)
            return response.content
        except requests.RequestException as err:
            raise ServiceUnavailableError(
                "Failed to connect to DocSpec conversion service",
            ) from err
79+
80+
2281
class YdocConverter:
23-
"""Service class for conversion-related operations."""
82+
"""Service class for YDoc conversion-related operations."""
2483

2584
@property
2685
def auth_header(self):
@@ -45,7 +104,7 @@ def _request(self, url, data, content_type, accept):
45104
return response
46105

47106
def convert(
48-
self, text, content_type="text/markdown", accept="application/vnd.yjs.doc"
107+
self, text, content_type=mime_types.MARKDOWN, accept=mime_types.YJS
49108
):
50109
"""Convert a Markdown text into our internal format using an external microservice."""
51110

@@ -59,14 +118,14 @@ def convert(
59118
content_type,
60119
accept,
61120
)
62-
if accept == "application/vnd.yjs.doc":
121+
if accept == mime_types.YJS:
63122
return b64encode(response.content).decode("utf-8")
64-
if accept in {"text/markdown", "text/html"}:
123+
if accept in {mime_types.MARKDOWN, "text/html"}:
65124
return response.text
66-
if accept == "application/json":
125+
if accept == mime_types.JSON:
67126
return response.json()
68127
raise ValidationError("Unsupported format")
69128
except requests.RequestException as err:
70129
raise ServiceUnavailableError(
71-
"Failed to connect to conversion service",
130+
f"Failed to connect to YDoc conversion service {content_type}, {accept}",
72131
) from err
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""MIME type identifiers shared by the document conversion services."""

# Editor / collaboration formats.
BLOCKNOTE = "application/vnd.blocknote+json"
YJS = "application/vnd.yjs.doc"

# Text-based formats.
MARKDOWN = "text/markdown"
JSON = "application/json"
HTML = "text/html"

# Office formats.
DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

src/backend/impress/settings.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,12 @@ class Base(Configuration):
680680
environ_prefix=None,
681681
)
682682

683+
# DocSpec API microservice
684+
DOCSPEC_API_URL = values.Value(
685+
environ_name="DOCSPEC_API_URL",
686+
environ_prefix=None
687+
)
688+
683689
# Conversion endpoint
684690
CONVERSION_API_ENDPOINT = values.Value(
685691
default="convert",

0 commit comments

Comments
 (0)