Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ FROM python:3.11-bullseye AS base
WORKDIR /project
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
RUN python -m spacy download en_core_web_sm

FROM base AS test
RUN chmod +x docker-entrypoint.sh
Expand Down
3 changes: 3 additions & 0 deletions hivemind_etl/mediawiki/etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,13 @@ def transform(self) -> list[Document]:
return documents

def load(self, documents: list[Document]) -> None:
    """Ingest the given documents and optionally delete the dump directory.

    The documents are handed to a ``CustomIngestionPipeline`` built for this
    instance's ``community_id`` and the ``"mediawiki"`` collection. A log
    line is emitted before and after ingestion (the messages name Qdrant as
    the destination). When ``self.delete_dump_after_load`` is truthy, the
    directory at ``self.dump_dir`` is removed afterwards.

    Parameters
    ----------
    documents : list[Document]
        Documents to pass to the ingestion pipeline.
    """
    logging.info(f"Loading {len(documents)} documents into Qdrant!")
    CustomIngestionPipeline(
        self.community_id,
        collection_name="mediawiki",
    ).run_pipeline(documents)
    # The count is re-evaluated on purpose rather than cached before the run.
    logging.info(f"Loaded {len(documents)} documents into Qdrant!")

    # Cleanup is opt-in; nothing more to do when the flag is off.
    if not self.delete_dump_after_load:
        return

    logging.info(f"Removing dump directory {self.dump_dir}!")
    shutil.rmtree(self.dump_dir)