2222 FetchcraftIngestionPipelineFactory ,
2323 IngestionConfig , DefaultIndexFactory ,
2424)
25+ from fetchcraft .connector import Connector
26+ from fetchcraft .connector .filesystem import FilesystemConnector
2527from fetchcraft .document_store import MongoDBDocumentStore
2628from fetchcraft .embeddings import OpenAIEmbeddings
29+ from fetchcraft .ingestion import Source , ConnectorSource
2730from fetchcraft .ingestion .pipeline import TrackedIngestionPipeline
2831from fetchcraft .ingestion .transformations import (
2932 AsyncParsingTransformation ,
@@ -51,6 +54,8 @@ class DefaultPipelineFactory(FetchcraftIngestionPipelineFactory):
5154 index_factory : DefaultIndexFactory
5255 chunker : HierarchicalNodeParser
5356 parser_map : dict [str , DocumentParser ]
57+ directories : list [str ] = []
58+ connector : Connector
5459
5560 def __init__ (self , ** kwargs ):
5661 super ().__init__ (** kwargs )
@@ -65,13 +70,24 @@ def __init__(self, **kwargs):
6570 - Chunking transformation
6671 - Vector index sink
6772 """
68-
69- def configure_pipeline (self , pipeline : TrackedIngestionPipeline ) -> None :
73+
async def create_source(self, documents_path: Path) -> Source:
    """Build the document source that feeds the ingestion pipeline.

    The returned source wraps this factory's ``connector`` and is rooted at
    ``self._document_root``.

    Args:
        documents_path: Part of the method signature; this implementation
            does not read it.

    Returns:
        A ``ConnectorSource`` bound to ``self.connector``.
    """
    source = ConnectorSource(
        connector=self.connector,
        document_root=self._document_root,
    )
    return source
80+
async def configure_pipeline(self, pipeline: TrackedIngestionPipeline) -> None:
    """Configure the pipeline with default transformations and sinks.

    Registration order is: parsing, keyword extraction, chunking, then the
    vector index sink. Finally the directory list is handed to the pipeline
    through ``pipeline.context``.

    Args:
        pipeline: The pipeline to register transformations and sinks on.
    """
    # Only ask the connector when no directories are configured. The result
    # is stored on the instance, so later calls reuse it. A truthiness check
    # (rather than ``len(...) == 0``) also tolerates ``directories`` being
    # ``None``.
    if not self.directories:
        self.directories = await self.connector.list_directories()

    pipeline.add_transformation(AsyncParsingTransformation(parser_map=self.parser_map))
    pipeline.add_transformation(ExtractKeywords())
    pipeline.add_transformation(ChunkingTransformation(chunker=self.chunker))
    pipeline.add_sink(VectorIndexSink(index_factory=self.index_factory))
    # NOTE(review): presumably exposes the directory list to pipeline steps;
    # confirm against ``TrackedIngestionPipeline.context``.
    pipeline.context({"directories": self.directories})
7591
7692
7793
@@ -121,6 +137,11 @@ def get_ingestion_dependencies(settings: IngestionConfig):
121137 # Build callback URL for docling async parsing
122138 callback_url = f"{ settings .callback_base_url } /api/tasks/callback"
123139
140+ connector = FilesystemConnector (
141+ path = settings .documents_path ,
142+ filter = None
143+ )
144+
124145 parser_map = {
125146 "default" : TextFileParser (),
126147 "application/pdf" : RemoteDoclingParser (
@@ -132,6 +153,7 @@ def get_ingestion_dependencies(settings: IngestionConfig):
132153 return {
133154 "index_factory" : index_factory ,
134155 "chunker" : chunker ,
156+ "connector" : connector ,
135157 "parser_map" : parser_map ,
136158 }
137159
0 commit comments