Skip to content

Commit d6e9aa5

Browse files
authored
Merge pull request #26 from TogetherCrew/fix/mediawiki-activities-wrong-arg
feat: add spaCy model download and Document type conversion in the MediaWiki load activity!
2 parents 52fcbcc + 42d5d06 commit d6e9aa5

File tree

3 files changed

+8
-1
lines changed

3 files changed

+8
-1
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ FROM python:3.11-bullseye AS base
22
WORKDIR /project
33
COPY . .
44
RUN pip install --no-cache-dir -r requirements.txt
5+
RUN python -m spacy download en_core_web_sm
56

67
FROM base AS test
78
RUN chmod +x docker-entrypoint.sh

hivemind_etl/mediawiki/activities.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,11 @@ async def load_mediawiki_data(mediawiki_platform: dict[str, Any]) -> None:
9696
"""Load the transformed MediaWiki data into the database."""
9797
community_id = mediawiki_platform["community_id"]
9898
namespaces = mediawiki_platform["namespaces"]
99+
99100
try:
100-
documents = mediawiki_platform["documents"]
101+
documents_dict = mediawiki_platform["documents"]
102+
# Temporal converted the documents to dicts, so we need to convert them back to Document objects
103+
documents = [Document.from_dict(doc) for doc in documents_dict]
101104

102105
logging.info(f"Starting data load for community {community_id}")
103106
mediawiki_etl = MediawikiETL(community_id=community_id, namespaces=namespaces)

hivemind_etl/mediawiki/etl.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,13 @@ def transform(self) -> list[Document]:
9494
return documents
9595

9696
def load(self, documents: list[Document]) -> None:
97+
logging.info(f"Loading {len(documents)} documents into Qdrant!")
9798
ingestion_pipeline = CustomIngestionPipeline(
9899
self.community_id, collection_name="mediawiki"
99100
)
100101
ingestion_pipeline.run_pipeline(documents)
102+
logging.info(f"Loaded {len(documents)} documents into Qdrant!")
101103

102104
if self.delete_dump_after_load:
105+
logging.info(f"Removing dump directory {self.dump_dir}!")
103106
shutil.rmtree(self.dump_dir)

0 commit comments

Comments
 (0)