Skip to content

Commit 2a9e5b7

Browse files
Aftabbsaftabbs
authored and committed
fix: replace in-place dataclass mutations with dataclasses.replace()
Resolves the in-place mutation warnings introduced by the _warn_on_inplace_mutation guard in PR #10650. Running `hatch run test:unit | grep "Mutating attribute"` surfaced mutations across five components. Each is replaced with `dataclasses.replace(instance, field=new_value)` so that dataclass instances are never mutated after creation. Changed files: - components/builders/chat_prompt_builder.py: replace _content mutation on rendered ChatMessage copy with dataclasses.replace() - core/pipeline/pipeline.py: replace two-field mutation on PipelineSnapshot (agent_snapshot + break_point) with a single dataclasses.replace() call - components/converters/image/file_to_image.py: replace ByteStream.mime_type mutation with dataclasses.replace() - components/extractors/llm_metadata_extractor.py: replace Document.content mutation with dataclasses.replace() (already imported `replace`) - components/fetchers/link_content.py: replace ByteStream.mime_type mutations in both sync and async run() methods - components/joiners/document_joiner.py: replace Document.score mutations in _score_norm, _reciprocal_rank_fusion, and _distribution_based_rank_fusion with non-mutating list comprehensions using dataclasses.replace() Also updates test_document_joiner.py::test_list_with_one_empty_list to compare by document ID rather than object identity, since the test previously relied on the mutation side-effect to make the assertion pass. Fixes #10659
1 parent 2261284 commit 2a9e5b7

7 files changed

Lines changed: 35 additions & 27 deletions

File tree

haystack/components/builders/chat_prompt_builder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import json
66
from copy import deepcopy
7+
from dataclasses import replace
78
from typing import Any, Literal
89

910
from jinja2.sandbox import SandboxedEnvironment
@@ -267,9 +268,8 @@ def run(
267268
raise ValueError(FILTER_NOT_ALLOWED_ERROR_MESSAGE)
268269
compiled_template = self._env.from_string(message.text)
269270
rendered_text = compiled_template.render(template_variables_combined)
270-
# deep copy the message to avoid modifying the original message
271-
rendered_message: ChatMessage = deepcopy(message)
272-
rendered_message._content = [TextContent(text=rendered_text)]
271+
# use dataclasses.replace to avoid in-place mutation of the copied message
272+
rendered_message: ChatMessage = replace(deepcopy(message), _content=[TextContent(text=rendered_text)])
273273
processed_messages.append(rendered_message)
274274
else:
275275
processed_messages.append(message)

haystack/components/converters/image/file_to_image.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import mimetypes
6+
from dataclasses import replace
67
from pathlib import Path
78
from typing import Any, Literal
89

@@ -124,7 +125,7 @@ def run(
124125
continue
125126

126127
if bytestream.mime_type is None and isinstance(source, Path):
127-
bytestream.mime_type = mimetypes.guess_type(source.as_posix())[0]
128+
bytestream = replace(bytestream, mime_type=mimetypes.guess_type(source.as_posix())[0])
128129

129130
if bytestream.data == _EMPTY_BYTE_STRING:
130131
logger.warning("File {source} is empty. Skipping it.", source=source)

haystack/components/extractors/llm_metadata_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ def _prepare_prompts(
263263
for idx, page in enumerate(pages["documents"]):
264264
if idx + 1 in expanded_range:
265265
content += page.content
266-
doc_copy.content = content
266+
doc_copy = replace(doc_copy, content=content)
267267
else:
268268
doc_copy = document
269269

haystack/components/fetchers/link_content.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from collections import defaultdict
77
from collections.abc import Callable
88
from concurrent.futures import ThreadPoolExecutor
9+
from dataclasses import replace
910
from fnmatch import fnmatch
1011
from typing import cast
1112

@@ -248,7 +249,7 @@ def run(self, urls: list[str]):
248249
if len(urls) == 1:
249250
stream_metadata, stream = self._fetch(urls[0])
250251
stream.meta.update(stream_metadata)
251-
stream.mime_type = stream.meta.get("content_type", None)
252+
stream = replace(stream, mime_type=stream.meta.get("content_type", None))
252253
streams.append(stream)
253254
else:
254255
with ThreadPoolExecutor() as executor:
@@ -257,7 +258,7 @@ def run(self, urls: list[str]):
257258
for stream_metadata, stream in results: # type: ignore
258259
if stream_metadata is not None and stream is not None:
259260
stream.meta.update(stream_metadata)
260-
stream.mime_type = stream.meta.get("content_type", None)
261+
stream = replace(stream, mime_type=stream.meta.get("content_type", None))
261262
streams.append(stream)
262263

263264
return {"streams": streams}
@@ -302,7 +303,7 @@ async def run_async(self, urls: list[str]):
302303
stream_metadata, stream = result_tuple
303304
if stream_metadata is not None and stream is not None:
304305
stream.meta.update(stream_metadata)
305-
stream.mime_type = stream.meta.get("content_type", None)
306+
stream = replace(stream, mime_type=stream.meta.get("content_type", None))
306307
streams.append(stream)
307308

308309
return {"streams": streams}

haystack/components/joiners/document_joiner.py

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import itertools
66
from collections import defaultdict
7+
from dataclasses import replace
78
from enum import Enum
89
from math import inf
910
from typing import Any
@@ -190,10 +191,7 @@ def _merge(self, document_lists: list[list[Document]]) -> list[Document]:
190191
scores_map[doc.id] += (doc.score if doc.score else 0) * weight
191192
documents_map[doc.id] = doc
192193

193-
for doc in documents_map.values():
194-
doc.score = scores_map[doc.id]
195-
196-
return list(documents_map.values())
194+
return [replace(doc, score=scores_map[doc.id]) for doc in documents_map.values()]
197195

198196
def _reciprocal_rank_fusion(self, document_lists: list[list[Document]]) -> list[Document]:
199197
"""
@@ -223,10 +221,7 @@ def _reciprocal_rank_fusion(self, document_lists: list[list[Document]]) -> list[
223221
for _id in scores_map:
224222
scores_map[_id] /= len(document_lists) / k
225223

226-
for doc in documents_map.values():
227-
doc.score = scores_map[doc.id]
228-
229-
return list(documents_map.values())
224+
return [replace(doc, score=scores_map[doc.id]) for doc in documents_map.values()]
230225

231226
@staticmethod
232227
def _distribution_based_rank_fusion(document_lists: list[list[Document]]) -> list[Document]:
@@ -236,26 +231,29 @@ def _distribution_based_rank_fusion(document_lists: list[list[Document]]) -> lis
236231
(https://medium.com/plain-simple-software/distribution-based-score-fusion-dbsf-a-new-approach-to-vector-search-ranking-f87c37488b18)
237232
If a Document is in more than one retriever, the one with the highest score is used.
238233
"""
234+
rescaled_lists: list[list[Document]] = []
239235
for documents in document_lists:
240236
if len(documents) == 0:
237+
rescaled_lists.append(documents)
241238
continue
242239

243-
scores_list = []
244-
245-
for doc in documents:
246-
scores_list.append(doc.score if doc.score is not None else 0)
240+
scores_list = [doc.score if doc.score is not None else 0 for doc in documents]
247241

248242
mean_score = sum(scores_list) / len(scores_list)
249243
std_dev = (sum((x - mean_score) ** 2 for x in scores_list) / len(scores_list)) ** 0.5
250244
min_score = mean_score - 3 * std_dev
251245
max_score = mean_score + 3 * std_dev
252246
delta_score = max_score - min_score
253247

254-
for doc in documents:
255-
doc.score = (doc.score - min_score) / delta_score if delta_score != 0.0 else 0.0
256-
# if all docs have the same score delta_score is 0, the docs are uninformative for the query
248+
# if all docs have the same score delta_score is 0, the docs are uninformative for the query
249+
rescaled_lists.append(
250+
[
251+
replace(doc, score=(doc.score - min_score) / delta_score if delta_score != 0.0 else 0.0)
252+
for doc in documents
253+
]
254+
)
257255

258-
return DocumentJoiner._concatenate(document_lists=document_lists)
256+
return DocumentJoiner._concatenate(document_lists=rescaled_lists)
259257

260258
def to_dict(self) -> dict[str, Any]:
261259
"""

haystack/core/pipeline/pipeline.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
from collections.abc import Mapping
6+
from dataclasses import replace
67
from typing import Any
78

89
from haystack import logging, tracing
@@ -409,8 +410,11 @@ def run( # noqa: PLR0915, PLR0912, C901
409410
# agent snapshot and attach it to the pipeline snapshot we create here.
410411
# We also update the break_point to be an AgentBreakpoint.
411412
if error.pipeline_snapshot and error.pipeline_snapshot.agent_snapshot:
412-
pipeline_snapshot.agent_snapshot = error.pipeline_snapshot.agent_snapshot
413-
pipeline_snapshot.break_point = error.pipeline_snapshot.agent_snapshot.break_point
413+
pipeline_snapshot = replace(
414+
pipeline_snapshot,
415+
agent_snapshot=error.pipeline_snapshot.agent_snapshot,
416+
break_point=error.pipeline_snapshot.agent_snapshot.break_point,
417+
)
414418

415419
# Attach the pipeline snapshot to the error before re-raising
416420
error.pipeline_snapshot = pipeline_snapshot

test/components/joiners/test_document_joiner.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,11 @@ def test_list_with_one_empty_list(self, join_mode: JoinMode):
102102
joiner = DocumentJoiner(join_mode=join_mode)
103103
documents = [Document(content="a"), Document(content="b"), Document(content="c")]
104104
result = joiner.run([[], documents])
105-
assert result == {"documents": documents}
105+
# Verify the same documents are returned (scoring functions assign scores to the results;
106+
# compare by ID to avoid relying on in-place score mutation of the input list).
107+
result_ids = {doc.id for doc in result["documents"]}
108+
expected_ids = {doc.id for doc in documents}
109+
assert result_ids == expected_ids
106110

107111
def test_unsupported_join_mode(self):
108112
unsupported_mode = "unsupported_mode"

0 commit comments

Comments
 (0)