Draft
84 commits
1a18929
Dataset interface
jlamypoirier Oct 15, 2025
fd63846
misc
jlamypoirier Oct 15, 2025
2486caf
fix
jlamypoirier Oct 15, 2025
92e93e8
Language model sample
jlamypoirier Oct 16, 2025
d6f6944
fix
jlamypoirier Oct 16, 2025
5c802fa
fixes
jlamypoirier Oct 16, 2025
95d1840
test
jlamypoirier Oct 16, 2025
eafd9cb
fixes
jlamypoirier Oct 17, 2025
c56df69
cleanup
jlamypoirier Oct 17, 2025
7f437e1
misc
jlamypoirier Oct 17, 2025
dfd27f5
misc
jlamypoirier Oct 17, 2025
90cd009
Memmap dataset
jlamypoirier Oct 18, 2025
acfd30e
fixes
jlamypoirier Oct 29, 2025
34939e9
fixes
jlamypoirier Oct 29, 2025
c5fa072
int64
jlamypoirier Oct 29, 2025
cd28676
Test and fix preparator
jlamypoirier Nov 5, 2025
435d214
fix
jlamypoirier Nov 5, 2025
f6bef55
fix
jlamypoirier Nov 6, 2025
e05d9a1
fix
jlamypoirier Nov 6, 2025
9ba8d1b
fix
jlamypoirier Nov 6, 2025
b35b297
fixes
jlamypoirier Nov 6, 2025
abe2357
misc
jlamypoirier Nov 11, 2025
1801d87
fix
jlamypoirier Nov 11, 2025
2223b85
fix right stage mode
bigximik Nov 13, 2025
a9a4ace
newer transformers fixes
bigximik Nov 13, 2025
97f2b60
fix distributed tests skip on single gpu
bigximik Nov 13, 2025
0fdc978
set mamba 2 style model conversions to broken
bigximik Nov 13, 2025
665deb5
Merge branch 'jlp/dataset_interface' of github.com:ServiceNow/Fast-LL…
bigximik Nov 17, 2025
4d03889
Merge branch 'jlp/lm_sample' of github.com:ServiceNow/Fast-LLM into d…
bigximik Nov 17, 2025
224c2ec
mamba2 enable conversion tests
bigximik Nov 17, 2025
f1afbf2
Merge branch 'jlp/memmap_dataset' of github.com:ServiceNow/Fast-LLM i…
bigximik Nov 17, 2025
00bba27
added model_and_sequence_data_group
bigximik Nov 23, 2025
5b20276
added Iterable dataset base classes
bigximik Nov 23, 2025
978a68f
added naive sampled iterable dataset
bigximik Nov 23, 2025
066a0bf
added iterable dataset configs, streaming dataset and PipelineRL samp…
bigximik Nov 23, 2025
68b3d65
added distributed data loader wrapper
bigximik Nov 23, 2025
2fbfe99
added iterable dataset to gpt data
bigximik Nov 23, 2025
0892523
appended comment
bigximik Nov 23, 2025
54fadb4
changed base classes for iterable dataset configs
bigximik Nov 24, 2025
4e11bf3
fix batch type
bigximik Nov 24, 2025
8428df8
fix added name property to the class
bigximik Nov 24, 2025
04ee4d7
add eof for tests
bigximik Nov 24, 2025
1217998
change base class to torch iterable
bigximik Nov 24, 2025
c542dac
added streaming dataset, sampling and base data tests
bigximik Nov 24, 2025
3999a8e
merge from main
bigximik Nov 24, 2025
c6ef780
merge from main
bigximik Nov 24, 2025
a1556f8
change import
bigximik Nov 24, 2025
63737b1
fix iterable sampler for spawn, add fake redis server to multi proces…
bigximik Nov 25, 2025
e843c8e
preparation for multi gpu tests
bigximik Nov 25, 2025
d5ce3f2
added multi gpu gptdata streaming test
bigximik Nov 26, 2025
c13c6df
added streaming dataset requirements
bigximik Nov 27, 2025
e6d8f49
added streaming dataset installation to tests
bigximik Nov 27, 2025
1e92dd4
removed checking for max samples
bigximik Nov 27, 2025
3ac4882
removed test eof, reduced timeout
bigximik Nov 27, 2025
46db991
changed tests to work without eof or max_samples_count
bigximik Nov 27, 2025
187055b
fix qwen2 converter to accept qkv biases properly
bigximik Nov 28, 2025
21833a0
fix import errors
rafapi Dec 4, 2025
2f5f848
changes to config
bigximik Dec 8, 2025
1e07bad
Merge branch 'denis/new_datasets' of github.com:ServiceNow/Fast-LLM i…
bigximik Dec 8, 2025
c8cb9fd
added tensor iterator
bigximik Dec 10, 2025
e367998
added trainer events
bigximik Dec 10, 2025
5230b74
update test for changed config
bigximik Dec 10, 2025
1a94de5
added 2 gpus trainer events test
bigximik Dec 10, 2025
6cfd445
fix for multiple gpus
bigximik Dec 10, 2025
333665d
updated test to multiple gpus
bigximik Dec 10, 2025
5d1f474
added not implemented for pp streaming
bigximik Dec 12, 2025
5f7cb29
removed PipelineRL sample and batch
bigximik Dec 12, 2025
d07a900
base redis and streaming dataset config class refactoring
bigximik Dec 12, 2025
3a7ba92
refactoring of redis config, trainer event config, corresponding tests
bigximik Dec 12, 2025
59f6f7d
removed eof message which is not supported
bigximik Dec 12, 2025
2c20ebd
added implementation for initial_weights_step_message_type event
bigximik Dec 12, 2025
f4107c3
removed explicit msg ack
bigximik Dec 16, 2025
c32ef89
fix of training finished event
bigximik Dec 16, 2025
f637649
alternative streaming implementations: one stream and n streams witho…
bigximik Dec 16, 2025
e43ce95
Merge remote-tracking branch 'origin/main' into denis/new_datasets
jlamypoirier Dec 16, 2025
5545598
merge from main
bigximik Dec 16, 2025
0d198ff
fix after merge added preprocessing empty configs
bigximik Dec 16, 2025
70ef5c4
fix for tests with no import
bigximik Dec 16, 2025
058c93c
fixes
jlamypoirier Dec 16, 2025
d34d39a
Merge remote-tracking branch 'origin/denis/new_datasets' into denis/n…
jlamypoirier Dec 16, 2025
ffb0a5f
Merge remote-tracking branch 'origin/main' into denis/new_datasets
jlamypoirier Dec 16, 2025
359231f
removed cloudpickle
bigximik Dec 16, 2025
ca9e94e
Simplify distributed
jlamypoirier Dec 16, 2025
ddd841d
Merge remote-tracking branch 'origin/main' into denis/new_datasets
jlamypoirier Dec 22, 2025
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -32,7 +32,7 @@ jobs:
pip install pybind11
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \
MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \
pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,GENERATION,DEV,DOCS,VISION]"
pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,VISION,GENERATION,STREAMING,DEV,DOCS]"
- name: Run tests
run: pytest -v -ra .

2 changes: 1 addition & 1 deletion .github/workflows/docs.yaml
@@ -34,7 +34,7 @@ jobs:
pip install pybind11
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \
MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \
pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,GENERATION,DEV,DOCS,VISION]"
pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,VISION,GENERATION,DEV,DOCS]"
- name: Build the documentation
run: mkdocs build

2 changes: 1 addition & 1 deletion Dockerfile
@@ -39,7 +39,7 @@ COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/

# Install dependencies within the virtual environment.
RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,VISION,GENERATION,DEV]" triton==3.5.1
RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,VISION,GENERATION,STREAMING,DEV]" triton==3.5.1

# Copy the remaining source code with universal write permissions.
COPY --chmod=777 ./Megatron-LM Megatron-LM
52 changes: 52 additions & 0 deletions fast_llm/data/data/data_loader_wrapper.py
@@ -0,0 +1,52 @@
import torch.distributed
import torch.utils.data.dataloader

from fast_llm.core.distributed import broadcast_object


class DistributedDataLoaderWrapper:
"""
Wraps a regular dataloader so that only the process group leader
loads data, and then broadcasts the batch to other ranks in the group.
"""

def __init__(
self,
dataloader: torch.utils.data.dataloader.DataLoader | None,
rank: int,
process_group: torch.distributed.ProcessGroup | None,
):
self.dataloader = dataloader
self.rank = rank
self.process_group = process_group

assert (self.rank == 0 and self.dataloader is not None) or (self.rank > 0 and self.dataloader is None)

def __iter__(self):
if self.rank == 0:
self.iterator = iter(self.dataloader)
if self.process_group is None:
return self.iterator
return self

def __next__(self):
# TODO:
# Instead of broadcasting a general object, make this iterator yield an actual Batch class.
# Implement `get_state_dict` and `from_state_dict` in the Batch class so that we can
# efficiently broadcast tensors directly. This avoids using `broadcast_object` on the
# entire Batch object, which is inefficient for tensors because it serializes
# (pickles) them before sending.

if self.rank == 0:
try:
data = next(self.iterator) # may raise StopIteration
except Exception as e:
data = e
data = broadcast_object(data, self.process_group, 0)
else:
data = broadcast_object(None, self.process_group, 0)

if isinstance(data, Exception):
raise data

return data
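
A minimal single-process usage sketch of the wrapper (the toy dataset and the degenerate `process_group=None` case are assumptions for illustration; during training, non-leader ranks construct the wrapper with `dataloader=None` and receive each batch via `broadcast_object`):

```python
import torch
import torch.utils.data

from fast_llm.data.data.data_loader_wrapper import DistributedDataLoaderWrapper


class ToyDataset(torch.utils.data.Dataset):
    """Stand-in for a real Fast-LLM sampled dataset."""

    def __init__(self, size: int) -> None:
        self._data = torch.arange(size)

    def __len__(self) -> int:
        return len(self._data)

    def __getitem__(self, index: int) -> torch.Tensor:
        return self._data[index]


# The group leader (rank 0) owns the real DataLoader; with no process group the
# wrapper simply forwards the underlying iterator.
loader = torch.utils.data.DataLoader(ToyDataset(8), batch_size=4)
wrapped = DistributedDataLoaderWrapper(loader, rank=0, process_group=None)

for batch in wrapped:
    print(batch)  # two tensors of shape (4,)
```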
42 changes: 37 additions & 5 deletions fast_llm/data/data/gpt/data.py
@@ -8,8 +8,10 @@

from fast_llm.core.distributed import safe_barrier
from fast_llm.data.data.abstract import Data
from fast_llm.data.data.data_loader_wrapper import DistributedDataLoaderWrapper
from fast_llm.data.data.gpt.config import GPTDataConfig
from fast_llm.data.dataset.abstract import SampledDataset
from fast_llm.data.dataset.abstract_iterable import SampledIterableDataset
from fast_llm.data.dataset.config import SamplingParameters
from fast_llm.data.dataset.gpt.config import GPTSamplingData
from fast_llm.data.dataset.monitor import DatasetMonitor
@@ -90,7 +92,12 @@ def setup(
dataset_name=dataset_name,
)
dataset = self._config.datasets[dataset_name].build_and_sample(sampling)
self._datasets[dataset_name] = DatasetMonitor(dataset, self._config.data_sample_warn_time_ms)
if isinstance(dataset, SampledDataset):
self._datasets[dataset_name] = DatasetMonitor(dataset, self._config.data_sample_warn_time_ms)
else:
                # Do not set a monitor for iterable datasets, as DatasetMonitor only works with map-style datasets
assert isinstance(dataset, SampledIterableDataset)
self._datasets[dataset_name] = dataset

safe_barrier(self._distributed.world_group, "data_preparation", timeout)
self._is_setup = True
@@ -116,9 +123,11 @@ def get_iterator(
Assert.in_range_incl(batch_config.sequence_length, 1, sampling_parameters.sequence_length)
log_main_rank(f"Initializing {dataset_name} dataset iterator from sample {consumed_samples}...")

return iter(
torch.utils.data.DataLoader(
self._datasets[dataset_name], # noqa
dataset = self._datasets[dataset_name]

if isinstance(dataset, SampledDataset):
data_loader = torch.utils.data.DataLoader(
dataset, # noqa
batch_sampler=SampledDatasetIterator(
total_samples=len(self._datasets[dataset_name]),
begin_index=consumed_samples,
@@ -132,4 +141,27 @@ def get_iterator(
collate_fn=LanguageModelBatch.from_samples,
multiprocessing_context=self._config.multiprocessing_context.value if num_workers > 0 else None,
)
)

elif isinstance(dataset, SampledIterableDataset):
if (
self.distributed.model_and_sequence_data_group is None
or self.distributed.model_and_sequence_data_group.rank() == 0
):
rank = 0
data_loader = torch.utils.data.DataLoader(
dataset, # noqa
batch_size=batch_config.micro_batch_size,
num_workers=0 if num_workers == 0 else 1,
prefetch_factor=prefetch_factor,
pin_memory=True,
collate_fn=LanguageModelBatch.from_samples,
multiprocessing_context=self._config.multiprocessing_context.value if num_workers > 0 else None,
)
else:
rank = self.distributed.model_and_sequence_data_group.rank()
data_loader = None
data_loader = DistributedDataLoaderWrapper(
data_loader, rank, self.distributed.model_and_sequence_data_group
)

return iter(data_loader)
1 change: 0 additions & 1 deletion fast_llm/data/dataset/abstract.py
@@ -44,7 +44,6 @@ def __len__(self) -> int:


class SamplableDataset[SampleType: Sample](Dataset[SampleType]):

@abc.abstractmethod
def sample(self, config: "SamplingData") -> SampledDataset[SampleType]:
pass
30 changes: 30 additions & 0 deletions fast_llm/data/dataset/abstract_iterable.py
@@ -0,0 +1,30 @@
import abc
import typing

import torch.utils.data

from fast_llm.data.sample.abstract import Sample

if typing.TYPE_CHECKING:
from fast_llm.data.dataset.config import SamplingData


# NOTE: We need to inherit from IterableDataset, otherwise the torch data loader cannot detect it properly
class SampledIterableDataset[SampleType: Sample](torch.utils.data.IterableDataset[SampleType]):
"""
A sampled dataset class that provides an iterator over samples.
"""

    # NOTE: We add `name` here so the class stays compatible with the Fast-LLM Dataset interface
@property
@abc.abstractmethod
def name(self) -> str:
"""
A name for the dataset to facilitate identification and debugging.
"""


class SamplableIterableDataset[SampleType: Sample](SampledIterableDataset[SampleType]):
@abc.abstractmethod
def sample(self, config: "SamplingData") -> SampledIterableDataset[SampleType]:
pass
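
For orientation, a hypothetical minimal concrete subclass of these base classes (the name `InMemoryIterableDataset` and its behaviour are invented for illustration; only the imported base classes come from this PR). It yields pre-built samples and returns itself from `sample()`, i.e. it applies no further sampling:

```python
import typing

from fast_llm.data.dataset.abstract_iterable import SamplableIterableDataset
from fast_llm.data.sample.abstract import Sample


class InMemoryIterableDataset[SampleType: Sample](SamplableIterableDataset[SampleType]):
    """Toy samplable iterable dataset backed by an in-memory list of samples."""

    def __init__(self, samples: list[SampleType], name: str = "in_memory"):
        self._samples = samples
        self._name = name

    def __iter__(self) -> typing.Iterator[SampleType]:
        yield from self._samples

    def sample(self, config) -> "InMemoryIterableDataset[SampleType]":
        # A real implementation would honor the sampling configuration, e.g. by
        # wrapping itself in a sampled iterable dataset; this sketch returns the
        # raw stream unchanged.
        return self

    @property
    def name(self) -> str:
        return self._name
```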
106 changes: 101 additions & 5 deletions fast_llm/data/dataset/config.py
@@ -7,13 +7,15 @@
import pathlib
import typing

from fast_llm.config import Config, Field, FieldHint, UpdateType, check_field, config_class
from fast_llm.config import Config, Field, FieldHint, FieldUpdate, UpdateType, check_field, config_class
from fast_llm.data.dataset.abstract import SamplableDataset, SampledDataset
from fast_llm.data.preprocessing.abstract import PreprocessingConfig
from fast_llm.data.sample.abstract import Sample
from fast_llm.redis.config import RedisConfig
from fast_llm.utils import Assert, normalize_probabilities

if typing.TYPE_CHECKING:
from fast_llm.data.dataset.abstract_iterable import SamplableIterableDataset, SampledIterableDataset
from fast_llm.data.dataset.indexed import ConcatenatedDataset, DatasetSlice, IndexedDataset
from fast_llm.engine.distributed.distributed import Distributed

@@ -106,19 +108,25 @@ class DatasetConfig[SampleType: Sample](Config):
@config_class(registry=True)
class SampledDatasetConfig[SampleType: Sample](DatasetConfig[SampleType]):
"""
A sampled dataset containing a prepared list of samples to be indexed sequentially (as-is) during training.
A sampled dataset containing a prepared list or iterable of samples to be indexed sequentially (as-is) during training.
"""

def build_and_sample(self, sampling: SamplingData) -> SampledDataset[SampleType]:
def build_and_sample(
self, sampling: SamplingData
) -> "SampledDataset[SampleType] | SampledIterableDataset[SampleType]":
raise NotImplementedError()


@config_class()
class SamplableDatasetConfig[SampleType: Sample](SampledDatasetConfig[SampleType]):
def build(self, preprocessing: PreprocessingConfig) -> SamplableDataset[SampleType]:
def build(
self, preprocessing: PreprocessingConfig
) -> "SamplableDataset[SampleType] | SamplableIterableDataset[SampleType]":
raise NotImplementedError()

def build_and_sample(self, sampling: SamplingData) -> SampledDataset[SampleType]:
def build_and_sample(
self, sampling: SamplingData
) -> "SampledDataset[SampleType] | SampledIterableDataset[SampleType]":
return self.build(sampling.preprocessing).sample(sampling)


@@ -298,3 +306,91 @@ def build(self, preprocessing: PreprocessingConfig) -> "IndexedDataset[SampleTyp
return LegacyMemmapDataset[SampleType](name, self.path, preprocessing)
else:
raise FileNotFoundError(self.path)


@config_class()
class StreamingDatasetRedisConfig(RedisConfig):
stream_key: str = FieldUpdate(default="fast_llm_streaming")

payload_key: str = FieldUpdate(
default="data",
)


class IngestionType(str, enum.Enum):
CONSUMER_GROUP = "consumer_group"
ONE_STREAM = "one_stream"
N_STREAMS = "n_streams"


class HashType(str, enum.Enum):
MESSAGE_INDEX = "message_index"
"""Use the index of the received message for hashing. Provides precise distribution but may not be well shuffled."""

MESSAGE_ID = "message_id"
"""Hash messages based on their unique message ID. Good for probabilistic distribution.
Redis message IDs are regenerated each time, so this is not reproducible.
"""

MESSAGE_BODY = "message_body"
"""Hash messages based on their payload content (bytes). Distributes messages roughly evenly.
Deterministic based on message content, but not perfectly balanced across ranks.
"""

PRODUCER_PROVIDED = "producer_provided"
"""Use the hash or index provided by the producer. Allows deterministic splitting and perfect balance."""


@config_class(dynamic_type={SampledDatasetConfig: "streaming"})
class StreamingDatasetConfig[SampleType: LanguageModelSample](SamplableDatasetConfig[SampleType]):
"""
Configuration for a streaming dataset that reads training data from a Redis stream.
"""

_abstract = False

redis: StreamingDatasetRedisConfig = Field(
desc="Redis connection and stream settings used to fetch incoming training data.",
hint=FieldHint.core,
)

group_name: str = Field(
default="fast_llm_dp_group",
desc="Name of the Redis consumer group used for data-parallel reading.",
hint=FieldHint.core,
)

consumer_name_prefix: str = Field(
default="fast_llm_dp_group_consumer",
desc="Prefix used to generate unique consumer names for each rank in Redis consumer group.",
hint=FieldHint.core,
)

ingestion_type: IngestionType = Field(
default=IngestionType.CONSUMER_GROUP,
desc="Strategy used to ingest data from Redis streams (consumer group, single stream, or multiple streams).",
hint=FieldHint.core,
)

hash_type: HashType = Field(
default=HashType.MESSAGE_ID,
desc="How to compute hash for assigning messages to ranks.",
hint=FieldHint.core,
)

hash_key: str = Field(
default="hash",
desc="Key in the message dict containing the hash or index provided by the producer.",
hint=FieldHint.core,
)

ack_period_per_consumer: int = Field(
default=10,
desc="Number of messages after which the consumer acknowledges received IDs back to the Redis hash.",
hint=FieldHint.core,
)

def build_and_sample(self, sampling: SamplingData) -> "SampledIterableDataset[SampleType]":
from fast_llm.data.dataset.streaming import StreamingDataset

return StreamingDataset[SampleType](self, sampling.distributed).sample(sampling)
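
To make the field semantics concrete, an illustrative producer-side sketch (not part of this PR) that pushes messages onto the stream this config points at, using the defaults above (`stream_key="fast_llm_streaming"`, `payload_key="data"`, `hash_key="hash"`); the payload encoding is an assumption and must match whatever `StreamingDataset` expects to deserialize:

```python
import redis  # redis-py client

client = redis.Redis(host="localhost", port=6379)

for index, payload in enumerate([b"sample-0", b"sample-1", b"sample-2"]):
    client.xadd(
        "fast_llm_streaming",  # stream_key
        {
            "data": payload,     # payload_key: serialized sample bytes (format assumed)
            "hash": str(index),  # hash_key: used when hash_type=producer_provided
        },
    )
```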
53 changes: 53 additions & 0 deletions fast_llm/data/dataset/sampled.py
@@ -9,6 +9,7 @@
import yaml

from fast_llm.data.dataset.abstract import SampledDataset
from fast_llm.data.dataset.abstract_iterable import SamplableIterableDataset, SampledIterableDataset
from fast_llm.data.dataset.config import SamplingData, ShufflingType
from fast_llm.data.dataset.indexed import IndexedDataset
from fast_llm.data.sample.abstract import Sample
@@ -429,3 +430,55 @@ def _load_yaml_data(self, data: dict[str, typing.Any]) -> None:

self._unshuffled_tokens = data["unshuffled_tokens"]
self._unshuffled_documents = data["unshuffled_epochs"] * self._documents_per_epoch


class NaiveSampledIterableDataset[SampleType: Sample](SampledIterableDataset[SampleType]):
def __init__(
self,
iterable_dataset: SamplableIterableDataset[SampleType],
sampling: SamplingData,
):
self._dataset = iterable_dataset
self._config = sampling.config
self._parameters = sampling.parameters

assert self._parameters.truncate_documents == False
assert self._config.shuffle == ShufflingType.disabled

def __iter__(self) -> typing.Iterator[SampleType]:
sample_length = self._parameters.sequence_length + self._parameters.extra_tokens
current_sample_length = 0
documents: list[SampleType] = []
for doc in self._dataset:
if len(doc) > sample_length:
                logging.warning(f"Dropping doc with length {len(doc)} longer than sample_length {sample_length}")
continue
if current_sample_length + len(doc) > sample_length:
padding_length = sample_length - current_sample_length
assert padding_length > 0
documents.append(documents[-1].get_padding(padding_length))

yield documents[0].from_documents(documents)

documents = [doc]
current_sample_length = len(doc)
else:
documents.append(doc)
current_sample_length += len(doc)

if current_sample_length == sample_length:
yield documents[0].from_documents(documents)

documents = []
current_sample_length = 0

if current_sample_length > 0:
padding_length = sample_length - current_sample_length
assert padding_length > 0
documents.append(documents[-1].get_padding(padding_length))

yield documents[0].from_documents(documents)

@property
def name(self) -> str:
return self._dataset.name
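
The packing policy above, summarized on document lengths alone: documents longer than the target sample length are dropped, a document that would overflow the current sample triggers padding and starts a new sample, and a trailing partial sample is padded. A small standalone sketch of that arithmetic (illustrative only; the real code operates on `Sample` objects and pads via `get_padding`):

```python
def pack_lengths(doc_lengths: list[int], sample_length: int) -> list[list[int]]:
    """Return packed samples as lists of lengths; a final entry may be padding."""
    samples: list[list[int]] = []
    current: list[int] = []
    total = 0
    for length in doc_lengths:
        if length > sample_length:
            continue  # dropped with a warning in NaiveSampledIterableDataset
        if total + length > sample_length:
            current.append(sample_length - total)  # pad out the current sample
            samples.append(current)
            current, total = [length], length
        else:
            current.append(length)
            total += length
        if total == sample_length:
            samples.append(current)
            current, total = [], 0
    if total > 0:
        current.append(sample_length - total)  # pad the final partial sample
        samples.append(current)
    return samples


# pack_lengths([3, 4, 6, 9, 2], sample_length=8) -> [[3, 4, 1], [6, 2]]
# (the 9-token document is dropped; the trailing 1 is padding)
```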