From 3cfb5a64c6c699c08b9a44fdea6b8ae8eb07cd69 Mon Sep 17 00:00:00 2001 From: Sidney Batchelder <44208509+sbatchelder@users.noreply.github.com> Date: Wed, 15 Jan 2025 07:04:08 -0600 Subject: [PATCH 01/30] [fix] Bad typing for S3ArtifactStorage_clientconfig args (#3276) --- aim/storage/artifacts/s3_storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aim/storage/artifacts/s3_storage.py b/aim/storage/artifacts/s3_storage.py index bc24c7372..d30951bb1 100644 --- a/aim/storage/artifacts/s3_storage.py +++ b/aim/storage/artifacts/s3_storage.py @@ -73,7 +73,7 @@ def _get_s3_client(self): return client -def S3ArtifactStorage_factory(**boto3_client_kwargs: dict): +def S3ArtifactStorage_factory(**boto3_client_kwargs): class S3ArtifactStorageCustom(S3ArtifactStorage): def _get_s3_client(self): import boto3 @@ -88,7 +88,7 @@ def _get_s3_client(self): return S3ArtifactStorageCustom -def S3ArtifactStorage_clientconfig(**boto3_client_kwargs: dict): +def S3ArtifactStorage_clientconfig(**boto3_client_kwargs): from aim.storage.artifacts import registry registry.registry['s3'] = S3ArtifactStorage_factory(**boto3_client_kwargs) From 4321b075b02eaf620ac4e97efa47e0adbf9cfdd6 Mon Sep 17 00:00:00 2001 From: Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com> Date: Wed, 15 Jan 2025 20:07:09 +0700 Subject: [PATCH 02/30] [docs] Fix pages/bookmarks section links (#3274) --- docs/source/ui/pages/bookmarks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/ui/pages/bookmarks.md b/docs/source/ui/pages/bookmarks.md index 4d5e080b2..900594023 100644 --- a/docs/source/ui/pages/bookmarks.md +++ b/docs/source/ui/pages/bookmarks.md @@ -2,7 +2,7 @@ ### Overview -Use the Bookmarks to save the Aim Explorer state. This includes search query, aggregations and any other modifications applied to the explorer. The Bookmarks page is a list of [cards](#bookmark-card) to quickly access the explorer state with one click. +Use the Bookmarks to save the Aim Explorer state. This includes search query, aggregations and any other modifications applied to the explorer. The Bookmarks page is a list of [cards](#the-bookmark-card) to quickly access the explorer state with one click. bookmarks
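The first patch above loosens the `**boto3_client_kwargs` annotation so that keyword arguments are forwarded to the boto3 client as intended. A minimal sketch of how that entry point might be exercised follows; it is not taken from the patch itself — the endpoint, bucket and credentials are illustrative placeholders, and the artifact-logging calls assume the `set_artifacts_uri`/`log_artifact` API of recent Aim releases:

    # Register custom boto3 client options for the 's3' artifact scheme,
    # then log an artifact from a run (sketch; values are placeholders).
    from aim import Run
    from aim.storage.artifacts.s3_storage import S3ArtifactStorage_clientconfig

    # After this call, artifact URIs starting with 's3://' are handled by a
    # storage class whose boto3 client is created with these keyword arguments,
    # e.g. an S3-compatible MinIO endpoint.
    S3ArtifactStorage_clientconfig(
        endpoint_url='http://localhost:9000',    # assumption: custom endpoint
        aws_access_key_id='ACCESS_KEY',          # assumption: placeholder credentials
        aws_secret_access_key='SECRET_KEY',
    )

    run = Run()
    run.set_artifacts_uri('s3://my-bucket/aim-artifacts/')   # assumption: placeholder bucket/prefix
    run.log_artifact('checkpoints/model.onnx', name='model.onnx')
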
From 7a98238c890be210f8be8009895d5589dd488ca8 Mon Sep 17 00:00:00 2001 From: Fabian Keller Date: Mon, 20 Jan 2025 08:17:42 +0100 Subject: [PATCH 03/30] [feat] Add `py.typed` marker to allow users to benefit from existing type annotations (#3281) add py.typed marker --- aim/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 aim/py.typed diff --git a/aim/py.typed b/aim/py.typed new file mode 100644 index 000000000..e69de29bb From eba27a9260d033810d7d4ddbebebe2fb0be81fd0 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Mon, 20 Jan 2025 18:41:04 +0400 Subject: [PATCH 04/30] [fix] Decrease client resources keep-alive time (#3279) --- CHANGELOG.md | 5 +++++ aim/ext/transport/heartbeat.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a8e16a51..38700a14a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Unreleased + +### Fixes: +- Decrease client resources keep-alive time (mihran113) + ## 3.27.0 Dec 18, 2024 ### Enhancements: diff --git a/aim/ext/transport/heartbeat.py b/aim/ext/transport/heartbeat.py index e8009576b..3346ccff2 100644 --- a/aim/ext/transport/heartbeat.py +++ b/aim/ext/transport/heartbeat.py @@ -118,7 +118,7 @@ def reset_responses(): class HeartbeatWatcher: - CLIENT_KEEP_ALIVE_TIME_DEFAULT = 30 * 60 # 30 minutes + CLIENT_KEEP_ALIVE_TIME_DEFAULT = 5 * 60 # 5 minutes def __init__(self, heartbeat_pool, keep_alive_time: Union[int, float] = CLIENT_KEEP_ALIVE_TIME_DEFAULT): self._heartbeat_pool = heartbeat_pool From 3ae23634682a6cc765d4d4cb11f13a356b5acc93 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Tue, 11 Feb 2025 16:58:22 +0400 Subject: [PATCH 05/30] [fix] Resolve issues on data points connection on epoch alignment (#3283) --- .github/workflows/nightly-release.yml | 2 +- .github/workflows/pull-request.yml | 16 ++++----- CHANGELOG.md | 1 + aim/web/api/tags/views.py | 3 +- aim/web/ui/src/utils/app/alignMetricData.ts | 38 ++++++++++++++------- 5 files changed, 37 insertions(+), 23 deletions(-) diff --git a/.github/workflows/nightly-release.yml b/.github/workflows/nightly-release.yml index 47b7b2eb1..43707bc87 100644 --- a/.github/workflows/nightly-release.yml +++ b/.github/workflows/nightly-release.yml @@ -40,7 +40,7 @@ jobs: - name: setup python uses: actions/setup-python@v2 with: - python-version: '3.7' + python-version: '3.8' architecture: x64 - name: install deps diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 152071b5e..348ab9349 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -24,14 +24,14 @@ on: - reopened - edited jobs: - validate-naming-convention: - name: Pull Request's title matches naming convention - runs-on: ubuntu-latest - steps: - - uses: deepakputhraya/action-pr-title@master - with: - regex: '^\[(?:feat|fix|doc|refactor|deprecation)\]\s[A-Z].*(? { + return ( + epoch + + (epochs[epoch].length > 1 + ? 
(0.99 / epochs[epoch].length) * + epochs[epoch].indexOf(metric.data.steps[i]) + : 0) + ); + }), + ]; + let yValues = [...metric.data.values]; + let pointsArray = []; + // Combine the x and y axis arrays into an array of points + for (let idx = 0; idx < xValues.length; idx++) { + pointsArray[idx] = [xValues[idx], yValues[idx]]; + } + // Sort the combined array based on the first element of the point (epoch) + pointsArray.sort(function (a, b) { + return a[0] - b[0]; + }); metric.data = { ...metric.data, - xValues: [ - ...metric.data.epochs.map((epoch, i) => { - return ( - epoch + - (epochs[epoch].length > 1 - ? (0.99 / epochs[epoch].length) * - epochs[epoch].indexOf(metric.data.steps[i]) - : 0) - ); - }), - ], - yValues: [...metric.data.values], + // Separate the x and y axis values back into xValues and yValues + xValues: pointsArray.map((point) => point[0]), + yValues: pointsArray.map((point) => point[1]), }; } } From c6e0c7f60a684e0bfe2e54ed28b93ed5e4fe4100 Mon Sep 17 00:00:00 2001 From: Albert Torosyan <32957250+alberttorosyan@users.noreply.github.com> Date: Wed, 12 Feb 2025 11:06:26 +0400 Subject: [PATCH 06/30] [fix] Correct indentation on query proxy object return statement (#3287) --- aim/storage/proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aim/storage/proxy.py b/aim/storage/proxy.py index d9c62c8c2..8d967837e 100644 --- a/aim/storage/proxy.py +++ b/aim/storage/proxy.py @@ -174,7 +174,7 @@ def __call__(self): if self.cache is not None and cache_key is not None: self.cache[cache_key] = res - return res + return res class AimObjectProxy(with_metaclass(_ObjectProxyMetaType)): From 2d9f3b8b0f4ef23fc94818847e48307dbc5f6564 Mon Sep 17 00:00:00 2001 From: Albert Torosyan <32957250+alberttorosyan@users.noreply.github.com> Date: Wed, 12 Feb 2025 11:24:08 +0400 Subject: [PATCH 07/30] [feat] Skip metrics check when run is known to yield false result (#3288) * [feat] Skip metrics check when run is known to yeld false result * [fix] Code style checks * [fix] More styling errors --- aim/sdk/query_analyzer.py | 151 +++++++++++++++++++++++++++++++++ aim/sdk/sequence_collection.py | 46 ++++++---- 2 files changed, 182 insertions(+), 15 deletions(-) create mode 100644 aim/sdk/query_analyzer.py diff --git a/aim/sdk/query_analyzer.py b/aim/sdk/query_analyzer.py new file mode 100644 index 000000000..1930bcf78 --- /dev/null +++ b/aim/sdk/query_analyzer.py @@ -0,0 +1,151 @@ +import ast + +from typing import Any, List, Tuple + + +class Unknown(ast.AST): + pass + + +Unknown = Unknown() # create a single instance of value node + + +class QueryExpressionTransformer(ast.NodeTransformer): + def __init__(self, *, var_names: List[str]): + self._var_names = var_names + + def transform(self, expr: str) -> Tuple[str, bool]: + node = ast.parse(expr, mode='eval') + transformed = self.visit(node) + if transformed is Unknown: + return expr, False + else: + return ast.unparse(transformed), True + + def visit_Expression(self, node: ast.Expression) -> Any: + node: ast.Expression = self.generic_visit(node) + if node.body is Unknown: + return Unknown + return node + + def visit_Expr(self, node: ast.Expr) -> Any: + node: ast.Expr = self.generic_visit(node) + if node.value is Unknown: + return Unknown + return node + + def visit_Constant(self, node: ast.Constant) -> Any: + return node + + def visit_JoinedStr(self, node: ast.JoinedStr) -> Any: + node: ast.JoinedStr = self.generic_visit(node) + for val in node.values: + if val is Unknown: + return Unknown + return node + + def 
visit_FormattedValue(self, node: ast.FormattedValue) -> Any: + node: ast.FormattedValue = self.generic_visit(node) + if node.value is Unknown: + return Unknown + return node + + def visit_Name(self, node: ast.Name) -> Any: + if node.id in self._var_names: + return Unknown + else: + return node + + def visit_Compare(self, node: ast.Compare) -> Any: + node: ast.Compare = self.generic_visit(node) + if node.left is Unknown: + return Unknown + for comp in node.comparators: + if comp is Unknown: + return Unknown + return node + + def visit_List(self, node: ast.List) -> Any: + node: ast.List = self.generic_visit(node) + for sub in node.elts: + if sub is Unknown: + return Unknown + return node + + def visit_Tuple(self, node: ast.Tuple) -> Any: + node: ast.Tuple = self.generic_visit(node) + for sub in node.elts: + if sub is Unknown: + return Unknown + return node + + def visit_Dict(self, node: ast.Dict) -> Any: + node: ast.Dict = self.generic_visit(node) + for key in node.keys: + if key is Unknown: + return Unknown + for val in node.values: + if val is Unknown: + return Unknown + return node + + def visit_BoolOp(self, node: ast.BoolOp) -> Any: + node: ast.BoolOp = self.generic_visit(node) + node_values = list(filter(lambda x: x is not Unknown, node.values)) + if isinstance(node.op, ast.And): + if len(node_values) == 1: + return node_values[0] + elif len(node_values) == 0: + return Unknown + else: + if len(node_values) < len(node.values): + return Unknown + return ast.BoolOp(op=node.op, values=node_values) + + def visit_UnaryOp(self, node: ast.UnaryOp) -> Any: + node: ast.UnaryOp = self.generic_visit(node) + if node.operand is Unknown: + return Unknown + return node + + def visit_BinOp(self, node: ast.BinOp) -> Any: + node: ast.BinOp = self.generic_visit(node) + if node.left is Unknown or node.right is Unknown: + return Unknown + return node + + def visit_IfExp(self, node: ast.IfExp) -> Any: + node: ast.IfExp = self.generic_visit(node) + if node.test is Unknown or node.body is Unknown or node.orelse is Unknown: + return Unknown + return node + + def visit_Attribute(self, node: ast.Attribute) -> Any: + node: ast.Attribute = self.generic_visit(node) + if node.value is Unknown: + return Unknown + return node + + def visit_Call(self, node: ast.Call) -> Any: + node: ast.Call = self.generic_visit(node) + if node.func is Unknown: + return Unknown + for arg in node.args: + if arg is Unknown: + return Unknown + for kwarg in node.keywords: + if kwarg is Unknown: + return Unknown + return node + + def visit_Subscript(self, node: ast.Subscript) -> Any: + node: ast.Subscript = self.generic_visit(node) + if node.value is Unknown or node.slice is Unknown: + return Unknown + return node + + def visit_Slice(self, node: ast.Slice) -> Any: + node: ast.Slice = self.generic_visit(node) + if node.lower is Unknown or node.upper is Unknown or node.step is Unknown: + return Unknown + return node diff --git a/aim/sdk/sequence_collection.py b/aim/sdk/sequence_collection.py index 5738a8e28..3c4699bc2 100644 --- a/aim/sdk/sequence_collection.py +++ b/aim/sdk/sequence_collection.py @@ -3,17 +3,20 @@ from abc import abstractmethod from typing import TYPE_CHECKING, Iterator +from tqdm import tqdm + +from aim.sdk.query_analyzer import QueryExpressionTransformer from aim.sdk.query_utils import RunView, SequenceView from aim.sdk.sequence import Sequence from aim.sdk.types import QueryReportMode from aim.storage.query import RestrictedPythonQuery -from tqdm import tqdm if TYPE_CHECKING: + from pandas import DataFrame + from 
aim.sdk.repo import Repo from aim.sdk.run import Run - from pandas import DataFrame logger = logging.getLogger(__name__) @@ -170,20 +173,33 @@ def iter_runs(self) -> Iterator['SequenceCollection']: if self.report_mode == QueryReportMode.PROGRESS_BAR: progress_bar = tqdm(total=total_runs) + seq_var = self.seq_cls.sequence_name() + t = QueryExpressionTransformer(var_names=[seq_var, ]) + run_expr, is_transformed = t.transform(self.query) + run_query = RestrictedPythonQuery(run_expr) + for run in runs_iterator: - seq_collection = SingleRunSequenceCollection( - run, - self.seq_cls, - self.query, - runs_proxy_cache=self.runs_proxy_cache, - timezone_offset=self._timezone_offset, - ) - if self.report_mode == QueryReportMode.PROGRESS_TUPLE: - yield seq_collection, (runs_counter, total_runs) - else: - if self.report_mode == QueryReportMode.PROGRESS_BAR: - progress_bar.update(1) - yield seq_collection + check_run_sequences = True + if is_transformed: + run_view = RunView(run, runs_proxy_cache=self.runs_proxy_cache, timezone_offset=self._timezone_offset) + match = run_query.check(**{'run': run_view}) + if not match: + check_run_sequences = False + + if check_run_sequences: + seq_collection = SingleRunSequenceCollection( + run, + self.seq_cls, + self.query, + runs_proxy_cache=self.runs_proxy_cache, + timezone_offset=self._timezone_offset, + ) + if self.report_mode == QueryReportMode.PROGRESS_TUPLE: + yield seq_collection, (runs_counter, total_runs) + else: + if self.report_mode == QueryReportMode.PROGRESS_BAR: + progress_bar.update(1) + yield seq_collection runs_counter += 1 def iter(self) -> Iterator[Sequence]: From 17660207610e554776c6ddd945d8133716151421 Mon Sep 17 00:00:00 2001 From: Fabian Keller Date: Thu, 13 Feb 2025 14:33:17 +0100 Subject: [PATCH 08/30] [chore] Bump ruff version from 0.3.3 to 0.9.2 and fix some invalid/dead noqas (#3282) --- aim/acme.py | 3 ++- aim/cli/convert/commands.py | 2 +- aim/cli/init/commands.py | 4 ++-- aim/cli/runs/utils.py | 2 +- aim/cli/server/commands.py | 2 +- aim/cli/up/commands.py | 2 +- aim/ext/notifier/notifier.py | 2 +- aim/ext/sshfs/utils.py | 2 +- aim/ext/tensorboard_tracker/tracker.py | 2 +- aim/ext/transport/heartbeat.py | 6 ++---- aim/ext/transport/message_utils.py | 3 ++- aim/ext/transport/utils.py | 2 +- aim/fastai.py | 2 +- aim/hf_dataset.py | 2 +- aim/hugging_face.py | 2 +- aim/keras.py | 3 ++- aim/keras_tuner.py | 2 +- aim/mxnet.py | 2 +- aim/optuna.py | 2 +- aim/paddle.py | 2 +- aim/prophet.py | 2 +- aim/pytorch.py | 3 ++- aim/pytorch_ignite.py | 2 +- aim/pytorch_lightning.py | 2 +- aim/sb3.py | 2 +- aim/sdk/adapters/fastai.py | 8 ++++++-- aim/sdk/adapters/keras.py | 2 +- aim/sdk/adapters/lightgbm.py | 3 +-- aim/sdk/adapters/mxnet.py | 2 +- aim/sdk/adapters/pytorch_ignite.py | 5 ++--- aim/sdk/adapters/xgboost.py | 3 +-- aim/sdk/callbacks/caller.py | 2 +- aim/sdk/objects/io/wavfile.py | 12 ++++++------ aim/sdk/repo.py | 4 ++-- aim/sdk/sequences/figure_sequence.py | 2 +- aim/sdk/types.py | 3 ++- aim/storage/hashing/hashing.py | 2 +- aim/storage/proxy.py | 4 ++-- aim/storage/query.py | 2 +- aim/storage/types.py | 4 +++- aim/tensorflow.py | 3 ++- aim/utils/__init__.py | 3 ++- aim/web/api/dashboard_apps/views.py | 2 +- aim/web/api/dashboards/views.py | 8 ++++---- aim/web/api/experiments/views.py | 2 +- aim/web/api/runs/object_views.py | 6 +++--- aim/web/middlewares/profiler/profiler.py | 2 +- examples/pytorch_lightning_track.py | 2 +- examples/pytorch_track.py | 2 +- examples/pytorch_track_images.py | 2 +- performance_tests/sdk/queries.py | 6 +++--- 
requirements.dev.txt | 2 +- ruff.toml | 6 +++++- 53 files changed, 88 insertions(+), 76 deletions(-) diff --git a/aim/acme.py b/aim/acme.py index 44884fd7d..30cce3037 100644 --- a/aim/acme.py +++ b/aim/acme.py @@ -1,2 +1,3 @@ # Alias to SDK acme interface -from aim.sdk.adapters.acme import AimCallback, AimWriter # noqa F401 +from aim.sdk.adapters.acme import AimCallback as AimCallback +from aim.sdk.adapters.acme import AimWriter as AimWriter diff --git a/aim/cli/convert/commands.py b/aim/cli/convert/commands.py index 4e23ef806..14160a536 100644 --- a/aim/cli/convert/commands.py +++ b/aim/cli/convert/commands.py @@ -40,7 +40,7 @@ def convert_tensorboard(ctx, logdir, flat, no_cache): @click.option('--flat', '-f', required=False, is_flag=True, default=False) def convert_tensorflow(ctx, logdir, flat): click.secho( - "WARN: Command 'tf' is deprecated and will be removed in future releases," " please use 'tensorboard' instead.", + "WARN: Command 'tf' is deprecated and will be removed in future releases, please use 'tensorboard' instead.", fg='red', ) repo_inst = ctx.obj['repo_inst'] diff --git a/aim/cli/init/commands.py b/aim/cli/init/commands.py index 1ef6fc354..4d538ec84 100644 --- a/aim/cli/init/commands.py +++ b/aim/cli/init/commands.py @@ -20,7 +20,7 @@ def init(repo, yes, skip_if_exists): re_init = False if Repo.exists(repo_path): if yes and skip_if_exists: - raise click.BadParameter('Conflicting init options.' 'Either specify -y/--yes or -s/--skip-if-exists') + raise click.BadParameter('Conflicting init options.Either specify -y/--yes or -s/--skip-if-exists') elif yes: re_init = True elif skip_if_exists: @@ -28,7 +28,7 @@ def init(repo, yes, skip_if_exists): return else: re_init = click.confirm( - 'Aim repository is already initialized. ' 'Do you want to re-initialize to empty Aim repository?' + 'Aim repository is already initialized. Do you want to re-initialize to empty Aim repository?' ) if not re_init: return diff --git a/aim/cli/runs/utils.py b/aim/cli/runs/utils.py index ec4c332f7..f2b64b13d 100644 --- a/aim/cli/runs/utils.py +++ b/aim/cli/runs/utils.py @@ -48,7 +48,7 @@ def upload_repo_runs(buffer: io.BytesIO, bucket_name: str) -> Tuple[bool, str]: import boto3 except ImportError: raise RuntimeError( - "This command requires 'boto3' to be installed. " 'Please install it with command: \n pip install boto3' + "This command requires 'boto3' to be installed. Please install it with command: \n pip install boto3" ) try: diff --git a/aim/cli/server/commands.py b/aim/cli/server/commands.py index b84c1ae68..7c1587021 100644 --- a/aim/cli/server/commands.py +++ b/aim/cli/server/commands.py @@ -95,5 +95,5 @@ def server(host, port, repo, ssl_keyfile, ssl_certfile, base_path, log_level, de ) exec_cmd(cmd, stream_output=True) except ShellCommandException: - click.echo('Failed to run Aim Tracking Server. ' 'Please see the logs above for details.') + click.echo('Failed to run Aim Tracking Server. Please see the logs above for details.') exit(1) diff --git a/aim/cli/up/commands.py b/aim/cli/up/commands.py index 2044c75e4..de8ed008d 100644 --- a/aim/cli/up/commands.py +++ b/aim/cli/up/commands.py @@ -96,7 +96,7 @@ def up( db_cmd = build_db_upgrade_command() exec_cmd(db_cmd, stream_output=True) except ShellCommandException: - click.echo('Failed to initialize Aim DB. ' 'Please see the logs above for details.') + click.echo('Failed to initialize Aim DB. 
Please see the logs above for details.') return if port == 0: diff --git a/aim/ext/notifier/notifier.py b/aim/ext/notifier/notifier.py index 1b237af62..ca73fe59a 100644 --- a/aim/ext/notifier/notifier.py +++ b/aim/ext/notifier/notifier.py @@ -34,7 +34,7 @@ def notify(self, message: Optional[str] = None, **kwargs): except Exception as e: attempt += 1 if attempt == self.MAX_RETRIES: - logger.error(f'Notifier {sub} failed to send message "{message}". ' f'No retries left.') + logger.error(f'Notifier {sub} failed to send message "{message}". No retries left.') raise NotificationSendError(e) else: logger.error( diff --git a/aim/ext/sshfs/utils.py b/aim/ext/sshfs/utils.py index 3dba53168..170f9d4fe 100644 --- a/aim/ext/sshfs/utils.py +++ b/aim/ext/sshfs/utils.py @@ -197,7 +197,7 @@ def unmount_remote_repo(mount_point: str, mount_root: str): if exit_code != 0: # in case of failure log warning so the user can unmount manually if needed logger.warning( - f'Could not unmount path: {mount_point}.\n' f'Please unmount manually using command:\n' f'{" ".join(cmd)}' + f'Could not unmount path: {mount_point}.\nPlease unmount manually using command:\n{" ".join(cmd)}' ) else: shutil.rmtree(mount_root) diff --git a/aim/ext/tensorboard_tracker/tracker.py b/aim/ext/tensorboard_tracker/tracker.py index 59af672f8..f902e0806 100644 --- a/aim/ext/tensorboard_tracker/tracker.py +++ b/aim/ext/tensorboard_tracker/tracker.py @@ -31,7 +31,7 @@ def _decode_histogram(value): # This is a bit weird but it seems the histogram counts is usually padded by 0 as tensorboard # only stores the right limits? - # See https://github.com/pytorch/pytorch/blob/7d2a18da0b3427fcbe44b461a0aa508194535885/torch/utils/tensorboard/summary.py#L390 # noqa + # See https://github.com/pytorch/pytorch/blob/7d2a18da0b3427fcbe44b461a0aa508194535885/torch/utils/tensorboard/summary.py#L390 bin_counts = bin_counts[1:] bin_range = (bucket_limits[0], bucket_limits[-1]) diff --git a/aim/ext/transport/heartbeat.py b/aim/ext/transport/heartbeat.py index 3346ccff2..d6390d63e 100644 --- a/aim/ext/transport/heartbeat.py +++ b/aim/ext/transport/heartbeat.py @@ -15,10 +15,8 @@ class HeartbeatSender(object): HEARTBEAT_INTERVAL_DEFAULT = 10 NETWORK_CHECK_INTERVAL = 180 - NETWORK_UNSTABLE_WARNING_TEMPLATE = ( - 'Network connection between client `{}` ' 'and server `{}` appears to be unstable.' - ) - NETWORK_ABSENT_WARNING_TEMPLATE = 'Network connection between client `{}` ' 'and server `{}` appears to be absent.' + NETWORK_UNSTABLE_WARNING_TEMPLATE = 'Network connection between client `{}` and server `{}` appears to be unstable.' + NETWORK_ABSENT_WARNING_TEMPLATE = 'Network connection between client `{}` and server `{}` appears to be absent.' 
def __init__( self, diff --git a/aim/ext/transport/message_utils.py b/aim/ext/transport/message_utils.py index 127b1f7e5..daa1823b6 100644 --- a/aim/ext/transport/message_utils.py +++ b/aim/ext/transport/message_utils.py @@ -5,7 +5,8 @@ from typing import Iterator, Tuple from aim.storage.object import CustomObject -from aim.storage.treeutils import decode_tree, encode_tree # noqa +from aim.storage.treeutils import decode_tree as decode_tree +from aim.storage.treeutils import encode_tree as encode_tree from aim.storage.types import BLOB diff --git a/aim/ext/transport/utils.py b/aim/ext/transport/utils.py index 037ad1262..b556692fe 100644 --- a/aim/ext/transport/utils.py +++ b/aim/ext/transport/utils.py @@ -13,7 +13,7 @@ def inner(func): def wrapper(*args, **kwargs): try: return func(*args, **kwargs) - except exc_type as e: # noqa + except exc_type: if error_message is not None: logger.error(error_message) raise RuntimeError(error_message) diff --git a/aim/fastai.py b/aim/fastai.py index 275882894..ab00bee14 100644 --- a/aim/fastai.py +++ b/aim/fastai.py @@ -1,2 +1,2 @@ # Alias to SDK fast.ai interface -from aim.sdk.adapters.fastai import AimCallback # noqa F401 +from aim.sdk.adapters.fastai import AimCallback as AimCallback diff --git a/aim/hf_dataset.py b/aim/hf_dataset.py index e629b15c9..00ecc9cc9 100644 --- a/aim/hf_dataset.py +++ b/aim/hf_dataset.py @@ -1,2 +1,2 @@ # Alias to SDK Hugging Face Datasets interface -from aim.sdk.objects.plugins.hf_datasets_metadata import HFDataset # noqa F401 +from aim.sdk.objects.plugins.hf_datasets_metadata import HFDataset as HFDataset diff --git a/aim/hugging_face.py b/aim/hugging_face.py index 9fbde32ec..692ec2486 100644 --- a/aim/hugging_face.py +++ b/aim/hugging_face.py @@ -1,2 +1,2 @@ # Alias to SDK Hugging Face interface -from aim.sdk.adapters.hugging_face import AimCallback # noqa F401 +from aim.sdk.adapters.hugging_face import AimCallback as AimCallback diff --git a/aim/keras.py b/aim/keras.py index 3383dff65..e1c6ed28f 100644 --- a/aim/keras.py +++ b/aim/keras.py @@ -1,2 +1,3 @@ # Alias to SDK Keras interface -from aim.sdk.adapters.keras import AimCallback, AimTracker # noqa F401 +from aim.sdk.adapters.keras import AimCallback as AimCallback +from aim.sdk.adapters.keras import AimTracker as AimTracker diff --git a/aim/keras_tuner.py b/aim/keras_tuner.py index 5f6577cae..5d264e64d 100644 --- a/aim/keras_tuner.py +++ b/aim/keras_tuner.py @@ -1,2 +1,2 @@ # Alias to SDK Keras-Tuner interface -from aim.sdk.adapters.keras_tuner import AimCallback # noqa F401 +from aim.sdk.adapters.keras_tuner import AimCallback as AimCallback diff --git a/aim/mxnet.py b/aim/mxnet.py index 403d33d40..ceacfb118 100644 --- a/aim/mxnet.py +++ b/aim/mxnet.py @@ -1,2 +1,2 @@ # Alias to SDK mxnet interface -from aim.sdk.adapters.mxnet import AimLoggingHandler # noqa F401 +from aim.sdk.adapters.mxnet import AimLoggingHandler as AimLoggingHandler diff --git a/aim/optuna.py b/aim/optuna.py index 5069d2469..28d0b1dbf 100644 --- a/aim/optuna.py +++ b/aim/optuna.py @@ -1,2 +1,2 @@ # Alias to SDK Optuna interface -from aim.sdk.adapters.optuna import AimCallback # noqa F401 +from aim.sdk.adapters.optuna import AimCallback as AimCallback diff --git a/aim/paddle.py b/aim/paddle.py index 0c4948641..9069d936a 100644 --- a/aim/paddle.py +++ b/aim/paddle.py @@ -1,2 +1,2 @@ # Alias to SDK PaddlePaddle interface -from aim.sdk.adapters.paddle import AimCallback # noqa F401 +from aim.sdk.adapters.paddle import AimCallback as AimCallback diff --git a/aim/prophet.py b/aim/prophet.py index 
1a43316f4..661e95cd4 100644 --- a/aim/prophet.py +++ b/aim/prophet.py @@ -1,2 +1,2 @@ # Alias to SDK Prophet interface -from aim.sdk.adapters.prophet import AimLogger # noqa F401 +from aim.sdk.adapters.prophet import AimLogger as AimLogger diff --git a/aim/pytorch.py b/aim/pytorch.py index c493b7a84..677a68f88 100644 --- a/aim/pytorch.py +++ b/aim/pytorch.py @@ -1,2 +1,3 @@ # Alias to SDK PyTorch utils -from aim.sdk.adapters.pytorch import track_params_dists, track_gradients_dists # noqa +from aim.sdk.adapters.pytorch import track_gradients_dists as track_gradients_dists +from aim.sdk.adapters.pytorch import track_params_dists as track_params_dists diff --git a/aim/pytorch_ignite.py b/aim/pytorch_ignite.py index 08cd67ce7..2189c6ddf 100644 --- a/aim/pytorch_ignite.py +++ b/aim/pytorch_ignite.py @@ -1,2 +1,2 @@ # Alias to SDK PyTorch Ignite interface -from aim.sdk.adapters.pytorch_ignite import AimLogger # noqa F401 +from aim.sdk.adapters.pytorch_ignite import AimLogger as AimLogger diff --git a/aim/pytorch_lightning.py b/aim/pytorch_lightning.py index 50d10c1aa..b9a3405f9 100644 --- a/aim/pytorch_lightning.py +++ b/aim/pytorch_lightning.py @@ -1,2 +1,2 @@ # Alias to SDK PyTorch Lightning interface -from aim.sdk.adapters.pytorch_lightning import AimLogger # noqa F401 +from aim.sdk.adapters.pytorch_lightning import AimLogger as AimLogger diff --git a/aim/sb3.py b/aim/sb3.py index 43fd7899e..78bdec8ee 100644 --- a/aim/sb3.py +++ b/aim/sb3.py @@ -1,2 +1,2 @@ # Alias to SDK sb3 interface -from aim.sdk.adapters.sb3 import AimCallback # noqa F401 +from aim.sdk.adapters.sb3 import AimCallback as AimCallback diff --git a/aim/sdk/adapters/fastai.py b/aim/sdk/adapters/fastai.py index 37390444c..88b7c4fdd 100644 --- a/aim/sdk/adapters/fastai.py +++ b/aim/sdk/adapters/fastai.py @@ -11,7 +11,7 @@ from fastcore.basics import detuplify, ignore_exceptions, store_attr except ImportError: raise RuntimeError( - 'This contrib module requires fastai to be installed. ' 'Please install it with command: \n pip install fastai' + 'This contrib module requires fastai to be installed. Please install it with command: \n pip install fastai' ) logger = getLogger(__name__) @@ -107,7 +107,11 @@ def gather_args(self): args['n_inp'] = n_inp xb = self.dls.valid.one_batch()[:n_inp] args.update( - {f'input {n+1} dim {i+1}': d for n in range(n_inp) for i, d in enumerate(list(detuplify(xb[n]).shape))} + { + f'input {n + 1} dim {i + 1}': d + for n in range(n_inp) + for i, d in enumerate(list(detuplify(xb[n]).shape)) + } ) except Exception: logger.warning('Failed to gather input dimensions') diff --git a/aim/sdk/adapters/keras.py b/aim/sdk/adapters/keras.py index 4a2141249..10af8b711 100644 --- a/aim/sdk/adapters/keras.py +++ b/aim/sdk/adapters/keras.py @@ -9,7 +9,7 @@ from keras.callbacks import Callback except ImportError: raise RuntimeError( - 'This contrib module requires keras to be installed. ' 'Please install it with command: \n pip install keras' + 'This contrib module requires keras to be installed. Please install it with command: \n pip install keras' ) diff --git a/aim/sdk/adapters/lightgbm.py b/aim/sdk/adapters/lightgbm.py index f2bae4e16..f006cd971 100644 --- a/aim/sdk/adapters/lightgbm.py +++ b/aim/sdk/adapters/lightgbm.py @@ -8,8 +8,7 @@ from lightgbm.callback import CallbackEnv except ImportError: raise RuntimeError( - 'This contrib module requires Lightgbm to be installed. ' - 'Please install it with command: \n pip install lightgbm' + 'This contrib module requires Lightgbm to be installed. 
Please install it with command: \n pip install lightgbm' ) diff --git a/aim/sdk/adapters/mxnet.py b/aim/sdk/adapters/mxnet.py index e10d4a19c..88f005dd8 100644 --- a/aim/sdk/adapters/mxnet.py +++ b/aim/sdk/adapters/mxnet.py @@ -75,7 +75,7 @@ def train_begin(self, estimator: Optional[Estimator], *args, **kwargs): optimizer = trainer.optimizer.__class__.__name__ lr = trainer.learning_rate - estimator.logger.info('Training begin: using optimizer %s ' 'with current learning rate %.4f ', optimizer, lr) + estimator.logger.info('Training begin: using optimizer %s with current learning rate %.4f ', optimizer, lr) if estimator.max_epoch: estimator.logger.info('Train for %d epochs.', estimator.max_epoch) else: diff --git a/aim/sdk/adapters/pytorch_ignite.py b/aim/sdk/adapters/pytorch_ignite.py index 42cf7d0f2..6a9506c54 100644 --- a/aim/sdk/adapters/pytorch_ignite.py +++ b/aim/sdk/adapters/pytorch_ignite.py @@ -8,7 +8,7 @@ from torch.optim import Optimizer except ImportError: raise RuntimeError( - 'This contrib module requires PyTorch to be installed. ' 'Please install it with command: \n pip install torch' + 'This contrib module requires PyTorch to be installed. Please install it with command: \n pip install torch' ) try: from ignite.contrib.handlers.base_logger import ( @@ -185,8 +185,7 @@ def __call__(self, engine: Engine, logger: AimLogger, event_name: Union[str, Eve if not isinstance(global_step, int): raise TypeError( - f'global_step must be int, got {type(global_step)}.' - ' Please check the output of global_step_transform.' + f'global_step must be int, got {type(global_step)}. Please check the output of global_step_transform.' ) metrics = {} diff --git a/aim/sdk/adapters/xgboost.py b/aim/sdk/adapters/xgboost.py index 8d9926287..832110f25 100644 --- a/aim/sdk/adapters/xgboost.py +++ b/aim/sdk/adapters/xgboost.py @@ -8,8 +8,7 @@ from xgboost.callback import TrainingCallback except ImportError: raise RuntimeError( - 'This contrib module requires XGBoost to be installed. ' - 'Please install it with command: \n pip install xgboost' + 'This contrib module requires XGBoost to be installed. Please install it with command: \n pip install xgboost' ) diff --git a/aim/sdk/callbacks/caller.py b/aim/sdk/callbacks/caller.py index 6ac0c29ae..387406e22 100644 --- a/aim/sdk/callbacks/caller.py +++ b/aim/sdk/callbacks/caller.py @@ -42,7 +42,7 @@ def trigger(self, event_name: str, **kwargs): for handler in handlers: try: handler(**all_kwargs) - except Exception: # noqa + except Exception: # TODO catch errors on handler invocation (nice-to-have) logger.warning(f"Failed to run callback '{handler.__name__}'.") logger.warning(traceback.format_exc()) diff --git a/aim/sdk/objects/io/wavfile.py b/aim/sdk/objects/io/wavfile.py index 5c58daf4a..34d187c7a 100644 --- a/aim/sdk/objects/io/wavfile.py +++ b/aim/sdk/objects/io/wavfile.py @@ -316,7 +316,7 @@ def _raise_bad_format(format_tag): except ValueError: format_name = f'{format_tag:#06x}' raise ValueError( - f"Unknown wave file format: {format_name}. Supported formats: {', '.join(x.name for x in KNOWN_WAVE_FORMATS)}" + f'Unknown wave file format: {format_name}. 
Supported formats: {", ".join(x.name for x in KNOWN_WAVE_FORMATS)}' ) @@ -447,12 +447,12 @@ def _read_data_chunk(fid, format_tag, channels, bit_depth, is_big_endian, block_ # Remaining bit depths can map directly to signed numpy dtypes dtype = f'{fmt}i{bytes_per_sample}' else: - raise ValueError('Unsupported bit depth: the WAV file ' f'has {bit_depth}-bit integer data.') + raise ValueError(f'Unsupported bit depth: the WAV file has {bit_depth}-bit integer data.') elif format_tag == WAVE_FORMAT.IEEE_FLOAT: if bit_depth in {32, 64}: dtype = f'{fmt}f{bytes_per_sample}' else: - raise ValueError('Unsupported bit depth: the WAV file ' f'has {bit_depth}-bit floating-point data.') + raise ValueError(f'Unsupported bit depth: the WAV file has {bit_depth}-bit floating-point data.') else: _raise_bad_format(format_tag) @@ -480,7 +480,7 @@ def _read_data_chunk(fid, format_tag, channels, bit_depth, is_big_endian, block_ data = numpy.memmap(fid, dtype=dtype, mode='c', offset=start, shape=(n_samples,)) fid.seek(start + size) else: - raise ValueError('mmap=True not compatible with ' f'{bytes_per_sample}-byte container size.') + raise ValueError(f'mmap=True not compatible with {bytes_per_sample}-byte container size.') _handle_pad_byte(fid, size) @@ -516,7 +516,7 @@ def _read_riff_chunk(fid): fmt = '>I' else: # There are also .wav files with "FFIR" or "XFIR" signatures? - raise ValueError(f'File format {repr(str1)} not understood. Only ' "'RIFF' and 'RIFX' supported.") + raise ValueError(f"File format {repr(str1)} not understood. Only 'RIFF' and 'RIFX' supported.") # Size of entire file file_size = struct.unpack(fmt, fid.read(4))[0] + 8 @@ -554,7 +554,7 @@ def read(buffer, mmap=False): if data_chunk_received: # End of file but data successfully read warnings.warn( - 'Reached EOF prematurely; finished at {:d} bytes, ' 'expected {:d} bytes from header.'.format( + 'Reached EOF prematurely; finished at {:d} bytes, expected {:d} bytes from header.'.format( fid.tell(), file_size ), WavFileWarning, diff --git a/aim/sdk/repo.py b/aim/sdk/repo.py index 992794964..6d2471cb2 100644 --- a/aim/sdk/repo.py +++ b/aim/sdk/repo.py @@ -985,7 +985,7 @@ def _backup_run(self, run_hash): from aim.sdk.utils import backup_run if self.is_remote_repo: - self._remote_repo_proxy._restore_run(run_hash) # noqa + self._remote_repo_proxy._restore_run(run_hash) else: backup_run(self, run_hash) @@ -993,7 +993,7 @@ def _restore_run(self, run_hash): from aim.sdk.utils import restore_run_backup if self.is_remote_repo: - self._remote_repo_proxy._restore_run(run_hash) # noqa + self._remote_repo_proxy._restore_run(run_hash) else: restore_run_backup(self, run_hash) diff --git a/aim/sdk/sequences/figure_sequence.py b/aim/sdk/sequences/figure_sequence.py index ff6081e60..885828f79 100644 --- a/aim/sdk/sequences/figure_sequence.py +++ b/aim/sdk/sequences/figure_sequence.py @@ -9,7 +9,7 @@ class Figures(Sequence): @classmethod def allowed_dtypes(cls) -> Union[str, Tuple[str, ...]]: - return (Figure.get_typename(),) # noqa : need a tuple for consitancy + return (Figure.get_typename(),) # need a tuple for consitancy @classmethod def sequence_name(cls) -> str: diff --git a/aim/sdk/types.py b/aim/sdk/types.py index 51fdc72cd..aa70e24ec 100644 --- a/aim/sdk/types.py +++ b/aim/sdk/types.py @@ -1,6 +1,7 @@ -from aim.storage.types import * # noqa F401 from enum import Enum +from aim.storage.types import * # noqa: F403 + class QueryReportMode(Enum): DISABLED = 0 diff --git a/aim/storage/hashing/hashing.py b/aim/storage/hashing/hashing.py index 
1aaa7e52e..eef53c5d2 100644 --- a/aim/storage/hashing/hashing.py +++ b/aim/storage/hashing/hashing.py @@ -11,7 +11,7 @@ from typing import Tuple, Union -from aim.storage.encoding import decode_int64, encode_int64 # noqa +from aim.storage.encoding import decode_int64, encode_int64 from aim.storage.hashing import c_hash from aim.storage.types import ( AimObject, diff --git a/aim/storage/proxy.py b/aim/storage/proxy.py index 8d967837e..cf3d84fca 100644 --- a/aim/storage/proxy.py +++ b/aim/storage/proxy.py @@ -192,8 +192,8 @@ def __name__(self, value): def __class__(self): return self.__wrapped__().__class__ - @__class__.setter # noqa - def __class__(self, value): # noqa + @__class__.setter + def __class__(self, value): self.__wrapped__().__class__ = value @property diff --git a/aim/storage/query.py b/aim/storage/query.py index 0ada6f153..82de23657 100644 --- a/aim/storage/query.py +++ b/aim/storage/query.py @@ -52,7 +52,7 @@ def safer_getattr(object, name, default=None, getattr=getattr): if name == 'format' and isinstance(object, str): raise NotImplementedError('Using format() on a %s is not safe.' % object.__class__.__name__) if name[0] == '_': - raise AttributeError('"{name}" is an invalid attribute name because it ' 'starts with "_"'.format(name=name)) + raise AttributeError('"{name}" is an invalid attribute name because it starts with "_"'.format(name=name)) val = getattr(object, name, default) return val diff --git a/aim/storage/types.py b/aim/storage/types.py index a21caa061..6fbf6e012 100644 --- a/aim/storage/types.py +++ b/aim/storage/types.py @@ -1,6 +1,8 @@ -from aim.storage.utils import BLOB # noqa F401 from typing import Dict, List, Tuple, Union +from aim.storage.utils import BLOB as BLOB + + NoneType = type(None) diff --git a/aim/tensorflow.py b/aim/tensorflow.py index 93ccaed16..17cee6549 100644 --- a/aim/tensorflow.py +++ b/aim/tensorflow.py @@ -1,2 +1,3 @@ # Alias to SDK TensorFlow Keras interface -from aim.sdk.adapters.tensorflow import AimCallback, AimTracker # noqa F401 +from aim.sdk.adapters.tensorflow import AimCallback as AimCallback +from aim.sdk.adapters.tensorflow import AimTracker as AimTracker diff --git a/aim/utils/__init__.py b/aim/utils/__init__.py index 761d0cd34..c48598750 100644 --- a/aim/utils/__init__.py +++ b/aim/utils/__init__.py @@ -1 +1,2 @@ -from aim.ext.exception_resistant import enable_safe_mode, disable_safe_mode # noqa +from aim.ext.exception_resistant import disable_safe_mode as disable_safe_mode +from aim.ext.exception_resistant import enable_safe_mode as enable_safe_mode diff --git a/aim/web/api/dashboard_apps/views.py b/aim/web/api/dashboard_apps/views.py index 50fb87123..7f1acae26 100644 --- a/aim/web/api/dashboard_apps/views.py +++ b/aim/web/api/dashboard_apps/views.py @@ -19,7 +19,7 @@ @dashboard_apps_router.get('/', response_model=ExploreStateListOut) async def dashboard_apps_list_api(session: Session = Depends(get_session)): - explore_states = session.query(ExploreState).filter(ExploreState.is_archived == False) # noqa + explore_states = session.query(ExploreState).filter(ExploreState.is_archived == False) # noqa: E712 result = [] for es in explore_states: result.append(explore_state_response_serializer(es)) diff --git a/aim/web/api/dashboards/views.py b/aim/web/api/dashboards/views.py index fe12bf69d..5cd16c302 100644 --- a/aim/web/api/dashboards/views.py +++ b/aim/web/api/dashboards/views.py @@ -19,7 +19,7 @@ @dashboards_router.get('/', response_model=List[DashboardOut]) async def dashboards_list_api(session: Session = 
Depends(get_session)): - dashboards_query = session.query(Dashboard).filter(Dashboard.is_archived == False).order_by(Dashboard.updated_at) # noqa + dashboards_query = session.query(Dashboard).filter(Dashboard.is_archived == False).order_by(Dashboard.updated_at) # noqa: E712 result = [] for dashboard in dashboards_query: @@ -50,7 +50,7 @@ async def dashboards_post_api(request_data: DashboardCreateIn, session: Session @dashboards_router.get('/{dashboard_id}/', response_model=DashboardOut) async def dashboards_get_api(dashboard_id: str, session: Session = Depends(get_session)): - dashboard = session.query(Dashboard).filter(Dashboard.uuid == dashboard_id, Dashboard.is_archived == False).first() # noqa + dashboard = session.query(Dashboard).filter(Dashboard.uuid == dashboard_id, Dashboard.is_archived == False).first() # noqa: E712 if not dashboard: raise HTTPException(status_code=404) @@ -61,7 +61,7 @@ async def dashboards_get_api(dashboard_id: str, session: Session = Depends(get_s async def dashboards_put_api( dashboard_id: str, request_data: DashboardUpdateIn, session: Session = Depends(get_session) ): - dashboard = session.query(Dashboard).filter(Dashboard.uuid == dashboard_id, Dashboard.is_archived == False).first() # noqa + dashboard = session.query(Dashboard).filter(Dashboard.uuid == dashboard_id, Dashboard.is_archived == False).first() # noqa: E712 if not dashboard: raise HTTPException(status_code=404) dashboard_name = request_data.name @@ -77,7 +77,7 @@ async def dashboards_put_api( @dashboards_router.delete('/{dashboard_id}/') async def dashboards_delete_api(dashboard_id: str, session: Session = Depends(get_session)): - dashboard = session.query(Dashboard).filter(Dashboard.uuid == dashboard_id, Dashboard.is_archived == False).first() # noqa + dashboard = session.query(Dashboard).filter(Dashboard.uuid == dashboard_id, Dashboard.is_archived == False).first() # noqa: E712 if not dashboard: raise HTTPException(status_code=404) diff --git a/aim/web/api/experiments/views.py b/aim/web/api/experiments/views.py index 9164eb78f..a47511cd9 100644 --- a/aim/web/api/experiments/views.py +++ b/aim/web/api/experiments/views.py @@ -114,7 +114,7 @@ async def update_experiment_properties_api(exp_id: str, exp_in: ExperimentUpdate if exp_in.archived is not None: if exp_in.archived and len(exp.runs) > 0: raise HTTPException( - status_code=400, detail=(f"Cannot archive experiment '{exp_id}'. " 'Experiment has associated runs.') + status_code=400, detail=(f"Cannot archive experiment '{exp_id}'. 
Experiment has associated runs.") ) exp.archived = exp_in.archived diff --git a/aim/web/api/runs/object_views.py b/aim/web/api/runs/object_views.py index 8e5d5b3f5..6a4b4d0e9 100644 --- a/aim/web/api/runs/object_views.py +++ b/aim/web/api/runs/object_views.py @@ -32,7 +32,7 @@ class CustomObjectApiConfig: sequence_type: type = Sequence resolve_blobs: bool = False - dump_record_fn: callable = lambda x: x.data # noqa E731 + dump_record_fn: callable = lambda x: x.data model: type = BaseModel @staticmethod @@ -165,7 +165,7 @@ class TextApiConfig(CustomObjectApiConfig): class DistributionApiConfig(CustomObjectApiConfig): sequence_type = Distributions resolve_blobs = True - dump_record_fn = lambda x: numpy_to_encodable(x.weights) # noqa E731 + dump_record_fn = lambda x: numpy_to_encodable(x.weights) # noqa: E731 model = DistributionInfo @@ -178,5 +178,5 @@ class AudioApiConfig(CustomObjectApiConfig): class FigureApiConfig(CustomObjectApiConfig): sequence_type = Figures resolve_blobs = True - dump_record_fn = lambda x: x.data # noqa E731 + dump_record_fn = lambda x: x.data # noqa: E731 model = FigureInfo diff --git a/aim/web/middlewares/profiler/profiler.py b/aim/web/middlewares/profiler/profiler.py index 0956dc8b5..fc39f4006 100644 --- a/aim/web/middlewares/profiler/profiler.py +++ b/aim/web/middlewares/profiler/profiler.py @@ -61,7 +61,7 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: profiler = self.profiler(interval=self._profiler_interval) try: profiler.start() - except: # noqa + except: # noqa: E722 skip_profiling = True else: skip_profiling = False diff --git a/examples/pytorch_lightning_track.py b/examples/pytorch_lightning_track.py index 9d9d8c98b..3a4b23c7c 100644 --- a/examples/pytorch_lightning_track.py +++ b/examples/pytorch_lightning_track.py @@ -4,7 +4,7 @@ if importlib.util.find_spec('lightning'): import lightning.pytorch as pl -elif importlib.util.find_spec('pytorch_lightning'): # noqa F401 +elif importlib.util.find_spec('pytorch_lightning'): # F401 import pytorch_lightning as pl else: raise RuntimeError( diff --git a/examples/pytorch_track.py b/examples/pytorch_track.py index 3927356cb..3c68bd51d 100644 --- a/examples/pytorch_track.py +++ b/examples/pytorch_track.py @@ -90,7 +90,7 @@ def forward(self, x): if i % 30 == 0: logging.info( - 'Epoch [{}/{}], Step [{}/{}], ' 'Loss: {:.4f}'.format( + 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format( epoch + 1, num_epochs, i + 1, total_step, loss.item() ) ) diff --git a/examples/pytorch_track_images.py b/examples/pytorch_track_images.py index adb693a2f..bcf4f627d 100644 --- a/examples/pytorch_track_images.py +++ b/examples/pytorch_track_images.py @@ -94,7 +94,7 @@ def forward(self, x): if i % 30 == 0: logging.info( - 'Epoch [{}/{}], Step [{}/{}], ' 'Loss: {:.4f}'.format( + 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format( epoch + 1, num_epochs, i + 1, total_step, loss.item() ) ) diff --git a/performance_tests/sdk/queries.py b/performance_tests/sdk/queries.py index 514ce5583..38b79bf3d 100644 --- a/performance_tests/sdk/queries.py +++ b/performance_tests/sdk/queries.py @@ -1,7 +1,7 @@ -query_0 = 'run.hparams.benchmark == "glue" ' 'and run.hparams.dataset == "cola" ' 'and metric.context.subset != "train"' -query_1 = 'run.hparams.benchmark == "glue" ' 'and run.hparams.dataset == "cola"' +query_0 = 'run.hparams.benchmark == "glue" and run.hparams.dataset == "cola" and metric.context.subset != "train"' +query_1 = 'run.hparams.benchmark == "glue" and run.hparams.dataset == "cola"' query_2 = 
'run.hparams.benchmark == "glue"' -query_3 = 'run.hparams.dataset == "cola" ' 'and run.experiment.name != "baseline-warp_4-cola"' +query_3 = 'run.hparams.dataset == "cola" and run.experiment.name != "baseline-warp_4-cola"' queries = { diff --git a/requirements.dev.txt b/requirements.dev.txt index 936433a7b..b36a02f59 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,3 +1,3 @@ wheel >= 0.31.0 twine >= 1.11.0 -ruff == 0.3.3 +ruff == 0.9.2 diff --git a/ruff.toml b/ruff.toml index d164ced82..e23481fd0 100644 --- a/ruff.toml +++ b/ruff.toml @@ -11,7 +11,11 @@ exclude = [ inline-quotes = "single" [lint] -extend-select = ["I"] +extend-select = [ + "I", + "PGH004", # blanket-noqa + "RUF100", # unused-noqa +] [lint.isort] no-lines-before = ["future", "standard-library", "first-party"] From db6fcc1e09351811682e5a58f8250e5bc9af2bc7 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Tue, 18 Feb 2025 16:29:53 +0400 Subject: [PATCH 09/30] [fix] Move performance tests to local mac mini (#3290) --- .github/workflows/pull-request.yml | 4 +-- performance_tests/BASELINE | 42 +++++++++++++++--------------- performance_tests/conftest.py | 2 +- tests/requirements.txt | 1 + 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 348ab9349..46fc96103 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -68,8 +68,8 @@ jobs: storage-performance-checks: needs: run-checks - concurrency: perf-tests - runs-on: [self-hosted, performance-tests] + concurrency: storage-performance-checks + runs-on: [self-hosted, perf-tests] name: Performance tests steps: - name: checkout diff --git a/performance_tests/BASELINE b/performance_tests/BASELINE index 1f403fadb..ebe22e938 100644 --- a/performance_tests/BASELINE +++ b/performance_tests/BASELINE @@ -1,21 +1,21 @@ -test_collect_metrics_data_0 1.8545958518981933 -test_collect_metrics_data_1 1.9959398269653321 -test_collect_metrics_data_2 10.835494375228881 -test_collect_metrics_data_3 1.8672633171081543 -test_collect_runs_data_0 0.8988437175750732 -test_collect_runs_data_1 1.039186429977417 -test_collect_runs_data_2 3.469265604019165 -test_collect_runs_data_3 0.9486905574798584 -test_query_metrics_0 1.6766140460968018 -test_query_metrics_1 1.6763684749603271 -test_query_metrics_2 1.6051365375518798 -test_query_metrics_3 1.5391615390777589 -test_query_runs_0 0.8991998672485352 -test_query_runs_1 0.9259328842163086 -test_query_runs_2 0.839762544631958 -test_query_runs_3 0.832861852645874 -test_container_open 0.1440361499786377 -test_iterative_access 4.000607919692993 -test_random_access_0 0.663770055770874 -test_random_access_1 1.4745195388793946 -test_random_access_2 2.424658107757568 \ No newline at end of file +test_collect_metrics_data_0 0.3717397689819336 +test_collect_metrics_data_1 0.3963047981262207 +test_collect_metrics_data_2 2.7405614376068117 +test_collect_metrics_data_3 0.3710219860076904 +test_collect_runs_data_0 0.17322354316711425 +test_collect_runs_data_1 0.20246338844299316 +test_collect_runs_data_2 0.7970072269439697 +test_collect_runs_data_3 0.1911233901977539 +test_query_metrics_0 0.311903190612793 +test_query_metrics_1 0.3122593879699707 +test_query_metrics_2 0.3092495441436768 +test_query_metrics_3 0.288785982131958 +test_query_runs_0 0.17433061599731445 +test_query_runs_1 0.17484822273254394 +test_query_runs_2 0.17181901931762694 +test_query_runs_3 0.1616499423980713 +test_container_open 0.04026708602905273 
+test_iterative_access 1.1857992172241212 +test_random_access_0 0.14068403244018554 +test_random_access_1 0.26419754028320314 +test_random_access_2 0.3941319942474365 \ No newline at end of file diff --git a/performance_tests/conftest.py b/performance_tests/conftest.py index a53afa2e7..95fa82005 100644 --- a/performance_tests/conftest.py +++ b/performance_tests/conftest.py @@ -43,7 +43,7 @@ def pytest_sessionstart(session): _init_test_repos() else: # github actions performance tests on self hosted runner - os.chdir('/home/ubuntu/performance_logs/') + os.chdir('/Users/github/workers/perf-tests/actions-runner/_work/performance_logs') time.sleep(10) diff --git a/tests/requirements.txt b/tests/requirements.txt index e3135d0bf..f2e592cb9 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -2,6 +2,7 @@ torch tensorflow deeplake<4.0.0 # update when proper documentation is available +azure-storage-blob # for deeplake # hub fastapi>=0.87.0 httpx From 07338cae07b5e119777cabd3d976d56187178be7 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Mon, 24 Feb 2025 23:03:22 +0400 Subject: [PATCH 10/30] [fix] Resolve session refresh issues when db file is replaced (#3294) --- aim/cli/storage/commands.py | 2 +- aim/sdk/query_analyzer.py | 13 ++++++++++++- aim/sdk/run.py | 2 +- aim/sdk/sequence_collection.py | 12 +++++++----- aim/storage/structured/db.py | 4 ++-- aim/utils/deprecation.py | 8 ++++---- aim/web/api/db.py | 7 ++++--- setup.py | 4 ++++ 8 files changed, 35 insertions(+), 17 deletions(-) diff --git a/aim/cli/storage/commands.py b/aim/cli/storage/commands.py index 60ec26614..3210f7a69 100644 --- a/aim/cli/storage/commands.py +++ b/aim/cli/storage/commands.py @@ -56,7 +56,7 @@ def to_3_11(ctx, hashes, yes): try: run = Run(run_hash, repo=repo) if run.check_metrics_version(): - backup_run(run) + backup_run(repo, run.hash) run.update_metrics() index_manager.index(run_hash) else: diff --git a/aim/sdk/query_analyzer.py b/aim/sdk/query_analyzer.py index 1930bcf78..7c7065a24 100644 --- a/aim/sdk/query_analyzer.py +++ b/aim/sdk/query_analyzer.py @@ -10,6 +10,17 @@ class Unknown(ast.AST): Unknown = Unknown() # create a single instance of value node +def unparse(*args, **kwargs): + import sys + + if sys.version_info.minor < 9: + import astunparse + + return astunparse.unparse(*args, **kwargs) + else: + return ast.unparse(*args, **kwargs) + + class QueryExpressionTransformer(ast.NodeTransformer): def __init__(self, *, var_names: List[str]): self._var_names = var_names @@ -20,7 +31,7 @@ def transform(self, expr: str) -> Tuple[str, bool]: if transformed is Unknown: return expr, False else: - return ast.unparse(transformed), True + return unparse(transformed), True def visit_Expression(self, node: ast.Expression) -> Any: node: ast.Expression = self.generic_visit(node) diff --git a/aim/sdk/run.py b/aim/sdk/run.py index 08b89b8c3..59bc4d806 100644 --- a/aim/sdk/run.py +++ b/aim/sdk/run.py @@ -287,7 +287,7 @@ def __init__( raise RuntimeError else: logger.warning(f'Detected sub-optimal format metrics for Run {self.hash}. 
Upgrading...') - backup_path = backup_run(self) + backup_path = backup_run(self.repo, self.hash) try: self.update_metrics() logger.warning(f'Successfully converted Run {self.hash}') diff --git a/aim/sdk/sequence_collection.py b/aim/sdk/sequence_collection.py index 3c4699bc2..62c083d45 100644 --- a/aim/sdk/sequence_collection.py +++ b/aim/sdk/sequence_collection.py @@ -3,20 +3,18 @@ from abc import abstractmethod from typing import TYPE_CHECKING, Iterator -from tqdm import tqdm - from aim.sdk.query_analyzer import QueryExpressionTransformer from aim.sdk.query_utils import RunView, SequenceView from aim.sdk.sequence import Sequence from aim.sdk.types import QueryReportMode from aim.storage.query import RestrictedPythonQuery +from tqdm import tqdm if TYPE_CHECKING: - from pandas import DataFrame - from aim.sdk.repo import Repo from aim.sdk.run import Run + from pandas import DataFrame logger = logging.getLogger(__name__) @@ -174,7 +172,11 @@ def iter_runs(self) -> Iterator['SequenceCollection']: progress_bar = tqdm(total=total_runs) seq_var = self.seq_cls.sequence_name() - t = QueryExpressionTransformer(var_names=[seq_var, ]) + t = QueryExpressionTransformer( + var_names=[ + seq_var, + ] + ) run_expr, is_transformed = t.transform(self.query) run_query = RestrictedPythonQuery(run_expr) diff --git a/aim/storage/structured/db.py b/aim/storage/structured/db.py index 3632bafa3..2849b3be4 100644 --- a/aim/storage/structured/db.py +++ b/aim/storage/structured/db.py @@ -63,7 +63,6 @@ def __init__(self, path: str, readonly: bool = False): self.engine = create_engine( self.db_url, echo=(logging.INFO >= int(os.environ.get(AIM_LOG_LEVEL_KEY, logging.WARNING))) ) - self.session_cls = scoped_session(sessionmaker(autoflush=False, bind=self.engine)) self._upgraded = None @classmethod @@ -91,7 +90,8 @@ def caches(self): return self._caches def get_session(self, autocommit=True): - session = self.session_cls() + session_cls = scoped_session(sessionmaker(autoflush=False, bind=self.engine)) + session = session_cls() setattr(session, 'autocommit', autocommit) return session diff --git a/aim/utils/deprecation.py b/aim/utils/deprecation.py index 46bd86266..11bf5af30 100644 --- a/aim/utils/deprecation.py +++ b/aim/utils/deprecation.py @@ -10,11 +10,11 @@ def python_version_deprecation_check(): import sys version_info = sys.version_info - if version_info.major == 3 and version_info.minor == 6: + if version_info.major == 3 and version_info.minor == 7: deprecation_warning( - remove_version='3.16', - msg='Python 3.6 has reached EOL. Aim support for Python 3.6 is deprecated!', - remove_msg_template='Python 3.6 support will be dropped in', + remove_version='3.17', + msg='Python 3.7 has reached EOL. 
Aim support for Python 3.7 is deprecated!', + remove_msg_template='Python 3.7 support will be dropped in', ) diff --git a/aim/web/api/db.py b/aim/web/api/db.py index 459fd3112..80aeaa539 100644 --- a/aim/web/api/db.py +++ b/aim/web/api/db.py @@ -15,12 +15,12 @@ echo=(logging.INFO >= int(os.environ.get(AIM_LOG_LEVEL_KEY, logging.WARNING))), connect_args={'check_same_thread': False}, ) -SessionLocal = sessionmaker(autoflush=False, bind=engine) Base = declarative_base() def get_session(): - session = SessionLocal() + session_cls = sessionmaker(autoflush=False, bind=engine) + session = session_cls() try: yield session finally: @@ -29,7 +29,8 @@ def get_session(): @contextmanager def get_contexted_session(): - session = SessionLocal() + session_cls = sessionmaker(autoflush=False, bind=engine) + session = session_cls() try: yield session finally: diff --git a/setup.py b/setup.py index 1cfacd0ca..00725b280 100644 --- a/setup.py +++ b/setup.py @@ -80,6 +80,9 @@ def package_files(directory): 'boto3', ] +if sys.version_info.minor < 9: + REQUIRED += ['astunparse'] + class UploadCommand(Command): """Support setup.py upload.""" @@ -194,6 +197,7 @@ def cytonize_extensions(): 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: Implementation :: PyPy', ], ext_modules=cytonize_extensions(), From fe316ddf7bd4395fcdf6453bc3b70a71b1182c02 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Tue, 4 Mar 2025 19:07:15 +0400 Subject: [PATCH 11/30] [fix] Resolve issue with adding duplicate tags (#3296) --- CHANGELOG.md | 6 ++++++ aim/storage/structured/sql_engine/entities.py | 9 +++++++++ aim/utils/deprecation.py | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5aa9e56be..a67e85036 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ ### Fixes: - Decrease client resources keep-alive time (mihran113) - Fix connection of data points on epoch alignment (mihran113) +- Resolve issue with adding duplicate tags to the same run (mihran113) +- Resolve session refresh issues when db file is replaced (mihran113) + +### Enhancements: +- Skip metrics check when run is known to yield false result (alberttorosyan) +- Correct indentation on query proxy object return statement (alberttorosyan) ## 3.27.0 Dec 18, 2024 diff --git a/aim/storage/structured/sql_engine/entities.py b/aim/storage/structured/sql_engine/entities.py index 84c72158c..554d4c70d 100644 --- a/aim/storage/structured/sql_engine/entities.py +++ b/aim/storage/structured/sql_engine/entities.py @@ -1,3 +1,5 @@ +import logging + from typing import Collection, List, Optional, Union import pytz @@ -26,6 +28,9 @@ from sqlalchemy.orm import joinedload +logger = logging.getLogger(__name__) + + def session_commit_or_flush(session): if getattr(session, 'autocommit', True) and sa_version >= '2.0.0': session.commit() @@ -227,6 +232,10 @@ def unsafe_add_tag(): self._model.tags.append(tag) session.add(self._model) + if value in self.tags: + logger.warning(f'Tag with value: {value} is already present in this run.') + return + session = self._session unsafe_add_tag() try: diff --git a/aim/utils/deprecation.py b/aim/utils/deprecation.py index 11bf5af30..b0bf2a70f 100644 --- a/aim/utils/deprecation.py +++ b/aim/utils/deprecation.py @@ -12,7 +12,7 @@ def python_version_deprecation_check(): version_info = sys.version_info if version_info.major == 3 and version_info.minor == 7: 
deprecation_warning( - remove_version='3.17', + remove_version='3.30', msg='Python 3.7 has reached EOL. Aim support for Python 3.7 is deprecated!', remove_msg_template='Python 3.7 support will be dropped in', ) From b6c0b1f59825b60478aede026cc8f2a91af983f2 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Tue, 11 Mar 2025 16:24:03 +0100 Subject: [PATCH 12/30] [fix] Message stream parsing (#3298) --- CHANGELOG.md | 1 + aim/ext/transport/message_utils.py | 15 +-------------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a67e85036..7dcc243e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ ### Enhancements: - Skip metrics check when run is known to yield false result (alberttorosyan) - Correct indentation on query proxy object return statement (alberttorosyan) +- Fix spurious assertion error in message stream parsing (qzed) ## 3.27.0 Dec 18, 2024 diff --git a/aim/ext/transport/message_utils.py b/aim/ext/transport/message_utils.py index daa1823b6..cc0422e5a 100644 --- a/aim/ext/transport/message_utils.py +++ b/aim/ext/transport/message_utils.py @@ -46,22 +46,9 @@ def pack_stream(tree: Iterator[Tuple[bytes, bytes]]) -> bytes: yield struct.pack('I', len(key)) + key + struct.pack('?', True) + struct.pack('I', len(val)) + val -def unpack_helper(msg: bytes) -> Tuple[bytes, bytes]: - (key_size,), tail = struct.unpack('I', msg[:4]), msg[4:] - key, tail = tail[:key_size], tail[key_size:] - (is_blob,), tail = struct.unpack('?', tail[:1]), tail[1:] - (value_size,), tail = struct.unpack('I', tail[:4]), tail[4:] - value, tail = tail[:value_size], tail[value_size:] - assert len(tail) == 0 - if is_blob: - yield key, BLOB(data=value) - else: - yield key, value - - def unpack_stream(stream) -> Tuple[bytes, bytes]: for msg in stream: - yield from unpack_helper(msg) + yield from unpack_args(msg) def raise_exception(server_exception): From 3c40c8350e8eb0b06d375870eb1e18e833a071d0 Mon Sep 17 00:00:00 2001 From: Albert Torosyan <32957250+alberttorosyan@users.noreply.github.com> Date: Thu, 13 Mar 2025 11:45:41 +0400 Subject: [PATCH 13/30] [fix] Handle empty queries (#3299) * [fix] Handle empty queries * [fix] Formatting issues --- aim/sdk/query_analyzer.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/aim/sdk/query_analyzer.py b/aim/sdk/query_analyzer.py index 7c7065a24..f06d589f0 100644 --- a/aim/sdk/query_analyzer.py +++ b/aim/sdk/query_analyzer.py @@ -1,4 +1,5 @@ import ast +import sys from typing import Any, List, Tuple @@ -9,15 +10,14 @@ class Unknown(ast.AST): Unknown = Unknown() # create a single instance of value node +if sys.version_info.minor < 9: + import astunparse -def unparse(*args, **kwargs): - import sys - - if sys.version_info.minor < 9: - import astunparse - + def unparse(*args, **kwargs): return astunparse.unparse(*args, **kwargs) - else: +else: + + def unparse(*args, **kwargs): return ast.unparse(*args, **kwargs) @@ -26,12 +26,15 @@ def __init__(self, *, var_names: List[str]): self._var_names = var_names def transform(self, expr: str) -> Tuple[str, bool]: - node = ast.parse(expr, mode='eval') - transformed = self.visit(node) - if transformed is Unknown: - return expr, False + if expr: + node = ast.parse(expr, mode='eval') + transformed = self.visit(node) + if transformed is Unknown: + return expr, False + else: + return unparse(transformed), True else: - return unparse(transformed), True + return expr, False def visit_Expression(self, node: ast.Expression) -> Any: node: 
ast.Expression = self.generic_visit(node) From 86deb77b35928757da2e9f8b2f7246c0451c3beb Mon Sep 17 00:00:00 2001 From: mihran113 Date: Thu, 13 Mar 2025 15:40:04 +0400 Subject: [PATCH 14/30] [chore] Remove legacy (`aim 2.x.x`) sdk (#3305) --- aim/sdk/__init__.py | 7 -- aim/sdk/legacy/__init__.py | 0 aim/sdk/legacy/deprecation_warning.py | 15 --- aim/sdk/legacy/flush.py | 6 -- aim/sdk/legacy/init.py | 7 -- aim/sdk/legacy/select.py | 30 ------ aim/sdk/legacy/session/__init__.py | 1 - aim/sdk/legacy/session/configs.py | 1 - aim/sdk/legacy/session/session.py | 132 -------------------------- aim/sdk/legacy/track.py | 16 ---- 10 files changed, 215 deletions(-) delete mode 100644 aim/sdk/legacy/__init__.py delete mode 100644 aim/sdk/legacy/deprecation_warning.py delete mode 100644 aim/sdk/legacy/flush.py delete mode 100644 aim/sdk/legacy/init.py delete mode 100644 aim/sdk/legacy/select.py delete mode 100644 aim/sdk/legacy/session/__init__.py delete mode 100644 aim/sdk/legacy/session/configs.py delete mode 100644 aim/sdk/legacy/session/session.py delete mode 100644 aim/sdk/legacy/track.py diff --git a/aim/sdk/__init__.py b/aim/sdk/__init__.py index 17d6974a6..f7c190da1 100644 --- a/aim/sdk/__init__.py +++ b/aim/sdk/__init__.py @@ -1,10 +1,3 @@ -# Legacy SDK functions -from aim.sdk.legacy.flush import flush -from aim.sdk.legacy.init import init -from aim.sdk.legacy.select import select_metrics, select_runs -from aim.sdk.legacy.session import Session -from aim.sdk.legacy.track import set_params, track - # pre-defined sequences and custom objects from aim.sdk.objects import Audio, Distribution, Figure, Image, Text from aim.sdk.repo import Repo diff --git a/aim/sdk/legacy/__init__.py b/aim/sdk/legacy/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/aim/sdk/legacy/deprecation_warning.py b/aim/sdk/legacy/deprecation_warning.py deleted file mode 100644 index 36047509e..000000000 --- a/aim/sdk/legacy/deprecation_warning.py +++ /dev/null @@ -1,15 +0,0 @@ -import logging - -from functools import wraps - - -logger = logging.getLogger(__name__) - - -def deprecated(func): - @wraps(func) - def wrapper(*args, **kwargs): - logger.warning(msg=f'Usage of {func.__qualname__} is deprecated!') - return func(*args, **kwargs) - - return wrapper diff --git a/aim/sdk/legacy/flush.py b/aim/sdk/legacy/flush.py deleted file mode 100644 index 74dc48c2a..000000000 --- a/aim/sdk/legacy/flush.py +++ /dev/null @@ -1,6 +0,0 @@ -from aim.sdk.legacy.deprecation_warning import deprecated - - -@deprecated -def flush(): - pass diff --git a/aim/sdk/legacy/init.py b/aim/sdk/legacy/init.py deleted file mode 100644 index 6456e4950..000000000 --- a/aim/sdk/legacy/init.py +++ /dev/null @@ -1,7 +0,0 @@ -from aim.sdk.legacy.deprecation_warning import deprecated -from aim.sdk.legacy.session import DefaultSession - - -@deprecated -def init(*args, **kwargs): - DefaultSession(*args, **kwargs) diff --git a/aim/sdk/legacy/select.py b/aim/sdk/legacy/select.py deleted file mode 100644 index 4b637d13f..000000000 --- a/aim/sdk/legacy/select.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Optional - -from aim.sdk.legacy.deprecation_warning import deprecated -from aim.sdk.repo import Repo - - -@deprecated -def select_metrics(search_statement: str, repo_path: Optional[str] = None): - if repo_path is not None: - repo = Repo.from_path(repo_path) - else: - repo = Repo.default_repo() - - if not repo: - return None - - return repo.query_metrics(search_statement) - - -@deprecated -def select_runs(expression: Optional[str] = None, 
repo_path: Optional[str] = None): - if repo_path is not None: - repo = Repo.from_path(repo_path) - else: - repo = Repo.default_repo() - - if not repo: - return None - - return repo.query_runs(expression) diff --git a/aim/sdk/legacy/session/__init__.py b/aim/sdk/legacy/session/__init__.py deleted file mode 100644 index 6c268677a..000000000 --- a/aim/sdk/legacy/session/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from aim.sdk.legacy.session.session import DefaultSession, Session diff --git a/aim/sdk/legacy/session/configs.py b/aim/sdk/legacy/session/configs.py deleted file mode 100644 index 08db0117c..000000000 --- a/aim/sdk/legacy/session/configs.py +++ /dev/null @@ -1 +0,0 @@ -DEFAULT_FLUSH_FREQUENCY = 128 diff --git a/aim/sdk/legacy/session/session.py b/aim/sdk/legacy/session/session.py deleted file mode 100644 index cc77b9744..000000000 --- a/aim/sdk/legacy/session/session.py +++ /dev/null @@ -1,132 +0,0 @@ -import atexit -import os -import signal -import threading - -from typing import Optional - -from aim.ext.exception_resistant import exception_resistant -from aim.ext.resource.configs import DEFAULT_SYSTEM_TRACKING_INT -from aim.sdk.legacy.deprecation_warning import deprecated -from aim.sdk.repo import Repo -from aim.sdk.run import Run - - -class Session: - sessions = {} - - _are_exit_listeners_set = False - _original_sigint_handler = None - _original_sigterm_handler = None - - @deprecated - def __init__( - self, - repo: Optional[str] = None, - experiment: Optional[str] = None, - flush_frequency: int = 0, # unused - block_termination: bool = True, # unused - run: Optional[str] = None, - system_tracking_interval: Optional[int] = DEFAULT_SYSTEM_TRACKING_INT, - ): - self._repo = Repo.from_path(repo) if repo else Repo.default_repo() - self._repo_path = self._repo.path - self._run = Run(run, repo=self._repo, experiment=experiment, system_tracking_interval=system_tracking_interval) - self._run_hash = self._run.hash - self.active = True - - Session.sessions.setdefault(self._repo_path, []) - Session.sessions[self._repo_path].append(self) - - # Bind signal listeners - self._set_exit_handlers() - - @property - def run_hash(self): - return self._run_hash - - @property - def repo_path(self): - return self._repo_path - - @exception_resistant(silent=False) - def track(self, *args, **kwargs): - val = args[0] - name = kwargs.pop('name') - step = kwargs.pop('step', None) - epoch = kwargs.pop('epoch', None) - for key in kwargs.keys(): - if key.startswith('__'): - del kwargs[key] - - self._run.track(val, name=name, step=step, epoch=epoch, context=kwargs) - - @exception_resistant(silent=False) - def set_params(self, params: dict, name: Optional[str] = None): - if name is None: - self._run[...] 
= params - else: - self._run[name] = params - - def flush(self): - pass - - @exception_resistant(silent=False) - def close(self): - if not self.active: - raise Exception('session is closed') - if self._run: - del self._run - self._run = None - if self._repo_path in Session.sessions and self in Session.sessions[self._repo_path]: - Session.sessions[self._repo_path].remove(self) - if len(Session.sessions[self._repo_path]) == 0: - del Session.sessions[self._repo_path] - self.active = False - - @classmethod - def _close_sessions(cls, *args, **kwargs): - threads = [] - for _, sessions in cls.sessions.items(): - for session in sessions: - th = threading.Thread(target=session.close) - th.daemon = True - threads.append(th) - - for th in threads: - th.start() - - for th in threads: - th.join() - - if len(args): - if args[0] == 15: - signal.signal(signal.SIGTERM, cls._original_sigterm_handler) - os.kill(os.getpid(), 15) - # elif args[0] == 2: - # signal.signal(signal.SIGINT, cls._original_sigint_handler) - # os.kill(os.getpid(), 2) - - @classmethod - def _set_exit_handlers(cls): - if not cls._are_exit_listeners_set: - cls._are_exit_listeners_set = True - # cls._original_sigint_handler = signal.getsignal(signal.SIGINT) - cls._original_sigterm_handler = signal.getsignal(signal.SIGTERM) - - atexit.register(cls._close_sessions) - # signal.signal(signal.SIGINT, cls._close_sessions) - signal.signal(signal.SIGTERM, cls._close_sessions) - - -DefaultSession = Session - - -def get_default_session() -> Session: - if len(Session.sessions.keys()) > 0: - default_sess_key = list(Session.sessions.keys())[0] - if len(Session.sessions[default_sess_key]) > 0: - return Session.sessions[default_sess_key][0] - - # Create and return default session otherwise - return DefaultSession() diff --git a/aim/sdk/legacy/track.py b/aim/sdk/legacy/track.py deleted file mode 100644 index ef04f2bcc..000000000 --- a/aim/sdk/legacy/track.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Optional - -from aim.sdk.legacy.deprecation_warning import deprecated -from aim.sdk.legacy.session.session import get_default_session - - -@deprecated -def track(*args, **kwargs): - sess = get_default_session() - return sess.track(*args, **kwargs) - - -@deprecated -def set_params(params: dict, name: Optional[str] = None): - sess = get_default_session() - return sess.set_params(params, name) From 795067ce475a67fd62ef360efc69515477dbf8d8 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Thu, 13 Mar 2025 15:42:47 +0400 Subject: [PATCH 15/30] [fix] Improve error messages for remote tracking (#3303) --- CHANGELOG.md | 3 ++- aim/ext/transport/message_utils.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7dcc243e4..9c267f09e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,12 @@ - Fix connection of data points on epoch alignment (mihran113) - Resolve issue with adding duplicate tags to the same run (mihran113) - Resolve session refresh issues when db file is replaced (mihran113) +- Improve error messages for remote tracking server (mihran113) +- Fix spurious assertion error in message stream parsing (qzed) ### Enhancements: - Skip metrics check when run is known to yield false result (alberttorosyan) - Correct indentation on query proxy object return statement (alberttorosyan) -- Fix spurious assertion error in message stream parsing (qzed) ## 3.27.0 Dec 18, 2024 diff --git a/aim/ext/transport/message_utils.py b/aim/ext/transport/message_utils.py index cc0422e5a..20f9c717f 100644 --- 
a/aim/ext/transport/message_utils.py +++ b/aim/ext/transport/message_utils.py @@ -55,7 +55,9 @@ def raise_exception(server_exception): module = importlib.import_module(server_exception.get('module_name')) exception = getattr(module, server_exception.get('class_name')) args = json.loads(server_exception.get('args') or []) - raise exception(*args) if args else exception() + message = server_exception.get('message') + + raise exception(*args) if args else Exception(message) def build_exception(exception: Exception): @@ -63,6 +65,7 @@ def build_exception(exception: Exception): 'module_name': exception.__class__.__module__, 'class_name': exception.__class__.__name__, 'args': json.dumps(exception.args), + 'message': str(exception), } From 5bafebb91aaed9ec541a8774a8105329d16df59b Mon Sep 17 00:00:00 2001 From: Vassilis Vassiliadis <43679502+VassilisVassiliadis@users.noreply.github.com> Date: Thu, 13 Mar 2025 11:54:28 +0000 Subject: [PATCH 16/30] [feat] Add AimCallback for distributed runs using the hugging face API (#3284) There is a singular aim.Run which the main worker initializes and manages. All auxiliary workers (local_rank 0 workers hosted on other nodes) collect their metrics and forward them to the main worker. The main worker records the metrics in AIM. Signed-off-by: Vassilis Vassiliadis --- aim/distributed_hugging_face.py | 2 + aim/sdk/adapters/distributed_hugging_face.py | 498 +++++++++++++++++++ 2 files changed, 500 insertions(+) create mode 100644 aim/distributed_hugging_face.py create mode 100644 aim/sdk/adapters/distributed_hugging_face.py diff --git a/aim/distributed_hugging_face.py b/aim/distributed_hugging_face.py new file mode 100644 index 000000000..0fa836d02 --- /dev/null +++ b/aim/distributed_hugging_face.py @@ -0,0 +1,2 @@ +# Alias to SDK distributed hugging face interface +from aim.sdk.adapters.distributed_hugging_face import AimCallback # noqa F401 diff --git a/aim/sdk/adapters/distributed_hugging_face.py b/aim/sdk/adapters/distributed_hugging_face.py new file mode 100644 index 000000000..76e5e6937 --- /dev/null +++ b/aim/sdk/adapters/distributed_hugging_face.py @@ -0,0 +1,498 @@ +import os + +from aim.ext.resource.stat import Stat + +try: + import accelerate.utils.environment +except ImportError: + raise RuntimeError( + "This contrib module requires HuggingFace Accelerate to be installed. 
" + "Please install it with command: \n pip install accelerate" + ) + +import copy +import struct +import threading +import time + +import aim +import aim.hugging_face +import aim.ext.resource +import aim.sdk.configs + +import typing +import socket +import select +import logging +import json + + +class IncompletePackageError(Exception): + pass + + +class IncompleteHeaderError(IncompletePackageError): + pass + + +class IncompleteDataError(IncompletePackageError): + pass + + +def packet_encode(usage: typing.Dict[str, typing.Any]) -> bytes: + data = json.dumps(usage) + header = len(data).to_bytes(4, "big") + packet = b"".join((header, struct.pack(f"!{len(data)}s", data.encode("utf-8")))) + return packet + + +def packet_decode(packet: bytes) -> typing.Dict[str, typing.Any]: + length = int.from_bytes(packet[:4], "big") + raw = struct.unpack_from(f"!{length}s", packet, 4)[0] + decoded = json.loads(raw) + return decoded + + +class ResourceTrackerForwarder(aim.ext.resource.ResourceTracker): + def _track(self, stat: Stat): + # Instead of tracking individual system metrics, forward the entire update to the MetricsReporter + # in turn, the MetricsReporter will create a packet ouf of Stat and its context (rank, world_size, etc). + # Next, it'll send that packet to the MetricsReceiver which will then push the data to the Aim server + self._tracker()(stat) + + +class MetricsReporter: + def __init__( + self, + host: str, + port: int, + node_rank: int, + rank: int, + interval: typing.Union[int, float], + ): + self.client: typing.Optional[socket.socket] = None + + self.node_rank = node_rank + self.rank = rank + self.log = logging.getLogger(f"MetricsReporter{rank}") + + self._connect(host=host, port=port) + self.tracker = ResourceTrackerForwarder( + tracker=self, interval=interval, capture_logs=False + ) + + def start(self): + self.tracker.start() + + def stop(self): + if self.tracker._shutdown is False: + self.tracker.stop() + if self.client is not None: + self.client.close() + self.client = None + + def _connect( + self, + host: str, + port: int, + connection_timeout: int = 60 * 10, + retry_seconds: int = 5, + ): + start = time.time() + + while time.time() - start <= connection_timeout: + # This should deal with both ipv4 and ipv6 hosts + for family, socktype, proto, canonname, sa in socket.getaddrinfo( + host, port, proto=socket.SOL_TCP + ): + self.client = socket.socket(family, socktype, proto) + try: + self.client.connect(sa) + return + except (ConnectionRefusedError, OSError) as e: + self.client.close() + self.log.info( + f"Could not connect to main worker due to {e} - " + f"will retry in {retry_seconds} seconds" + ) + time.sleep(retry_seconds) + + raise ConnectionError( + f"Could not connect to server {host}:{port} after {connection_timeout} seconds" + ) + + def __call__(self, stat: aim.ext.resource.tracker.Stat): + if self.client is None: + self.log.info( + "Connection has already closed, will not propagate this system metrics snapshot" + ) + return + + # This is invoked by @self.tracker + raw = { + "stat": stat.stat_item.to_dict(), + "worker": { + "node_rank": self.node_rank, + "rank": self.rank, + }, + } + self.log.debug(f"Send {raw}") + + packet = packet_encode(raw) + try: + self.client.sendall(packet) + except BrokenPipeError: + self.log.info( + f"BrokenPipeError while transmitting system metrics {raw} - will stop recording system metrics" + ) + try: + self.stop() + except RuntimeError as e: + if e.args[0] != "cannot join current thread": + # Calling stop() causes self.tracker() to try to join 
this thread. In turn that raises + # this RuntimeError + raise + except Exception as e: + self.log.info( + f"{e} while transmitting system metrics {raw} - will ignore exception" + ) + + +class MetricsReceiver: + def __init__( + self, + host: str, + port: int, + num_workers: int, + connection_timeout: int, + ): + self.tracker: typing.Optional[ + typing.Callable[ + [ + typing.Dict[str, typing.Any], + typing.Dict[str, typing.Any], + ], + None, + ] + ] = None + + self.clients: typing.List[socket.socket] = [] + self.log = logging.getLogger("MetricsReceiver") + self.server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + + self._wait_workers( + host=host, + port=port, + num_workers=num_workers, + connection_timeout=connection_timeout, + ) + + self.running = True + self.thread: typing.Optional[threading.Thread] = None + + def start( + self, + tracker: typing.Callable[ + [ + typing.Dict[str, typing.Any], + typing.Dict[str, typing.Any], + ], + None, + ], + ): + self.tracker = tracker + self.running = True + + self.thread = threading.Thread(target=self._collect_metrics, daemon=True) + self.thread.start() + + def stop(self): + if self.running: + self.running = False + self.thread.join() + + def _recv(self, sock: socket.socket, length: int) -> typing.Optional[bytes]: + data = b"" + retries = 0 + while len(data) < length and retries < 10: + buf = sock.recv(length - len(data)) + data += buf + retries += 1 + + if len(data) < length: + if len(data) > 0: + raise IncompletePackageError() + # If recv() returned b'' then the client disconnected + return None + + return data + + def _recv_packet( + self, sock: socket.socket + ) -> typing.Optional[typing.Dict[str, typing.Any]]: + try: + header = self._recv(sock, 4) + + if header is None: + # The client disconnected + return None + + length = int.from_bytes(header, "big") + except IncompletePackageError: + raise IncompleteHeaderError() + try: + data = self._recv(sock, length) + if len(data) > 0: + return json.loads(data) + except IncompletePackageError: + raise IncompleteDataError() + + def _collect_metrics(self): + while self.running: + read, _write, _error = select.select(self.clients, [], [], 5.0) + for client in typing.cast(typing.List[socket.socket], read): + try: + packet = self._recv_packet(client) + except IncompletePackageError as e: + self.log.info( + f"Error {e} while receiving update - will assume this is a transient error" + ) + continue + + if packet: + self.tracker(packet["stat"], packet["worker"]) + else: + self.log.info("Client disconnected") + client.close() + self.clients.remove(client) + + def _wait_workers( + self, host: str, port: int, num_workers: int, connection_timeout: float + ): + # This may raise an exception, don't catch it here and let it flow to the caller + self.server.bind((host, port)) + self.server.listen(num_workers) + + # We're actually going to pause here, till we get all clients OR we run out of time + + start = time.time() + self.log.info(f"Waiting for {num_workers} workers to connect") + # Block for 5 seconds while waiting for new connections + while time.time() - start <= connection_timeout: + read, _write, _error = select.select([self.server], [], [], 5.0) + for server in read: + client, _client_address = server.accept() + self.clients.append(client) + self.log.info(f"Client {len(self.clients)}/{num_workers} connected") + + if len(self.clients) == num_workers: + return + + self.server.close() + + raise ConnectionError( + f"{num_workers - len(self.clients)} out of {num_workers} total clients did not connect" + ) + + 
+class AimCallback(aim.hugging_face.AimCallback): + def __init__( + self, + main_port: int, + repo: typing.Optional[str] = None, + experiment: typing.Optional[str] = None, + system_tracking_interval: typing.Optional[ + int + ] = aim.ext.resource.DEFAULT_SYSTEM_TRACKING_INT, + log_system_params: typing.Optional[bool] = True, + capture_terminal_logs: typing.Optional[bool] = True, + main_addr: typing.Optional[str] = None, + distributed_information: typing.Optional[ + accelerate.utils.environment.CPUInformation + ] = None, + connection_timeout: int = 60 * 5, + workers_only_on_rank_0: bool = True, + ): + """A HuggingFace TrainerCallback which registers the system metrics of all workers involved in the training + under a single Aim run. + + This code initializes aim.hugging_face.AimCallback() only on rank 0 - otherwise we'd end up with multiple + Aim runs. + + Args: + main_port (:obj:`int`): Configures the port that the main worker will listen on. If this is None + then the code will raise an exception. + repo (:obj:`Union[Repo,str]`, optional): Aim repository path or Repo object to which Run object is bound. + If skipped, default Repo is used. + experiment (:obj:`str`, optional): Sets Run's `experiment` property. 'default' if not specified. + Can be used later to query runs/sequences. + system_tracking_interval (:obj:`int`, optional): Sets the tracking interval in seconds for system usage + metrics (CPU, Memory, etc.). Set to `None` to disable system metrics tracking. + log_system_params (:obj:`bool`, optional): Enable/Disable logging of system params such as installed + packages, git info, environment variables, etc. + main_addr (:obj:`str`, optional): The address of the main worker. If this is None then the code will + auto-discover it from the environment variable MASTER_ADDR. If this parameter cannot be resolved + to a non-empty value the method will raise an exception. + distributed_information (:obj:`str`, accelerate.utils.environment.CPUInformation): information about the + CPU in a distributed environment. If None, the code parses environment variables to auto create it. + See accelerate.utils.get_cpu_distributed_information() for more details + connection_timeout (:obj:`int`, optional): Maximum seconds to wait for the auxiliary workers to connect. + workers_only_on_rank_0 (:obj:`bool`): When set to true, only treat processes with local_rank 0 as + workers. Setting this to False, only makes sense when debugging the AimCallback() code. + + Raises: + ConnectionError: + If unable auxiliary workers are unable to connect to main worker + """ + if main_addr is None: + main_addr = os.environ.get("MASTER_ADDR") + + if not main_addr: + raise ValueError("main_addr cannot be empty") + + if not main_port or main_port <0: + raise ValueError("main_port must be a positive number") + + if distributed_information is None: + distributed_information = accelerate.utils.get_cpu_distributed_information() + + self.distributed_information = distributed_information + self.connection_timeout = connection_timeout + + self.listening_socket: typing.Optional[socket.socket] = None + + self.metrics_reporter: typing.Optional[MetricsReporter] = None + self.metrics_receiver: typing.Optional[MetricsReceiver] = None + + self._run: typing.Optional[aim.Run] = None + self.log = logging.getLogger("CustomAimCallback") + + if not workers_only_on_rank_0: + # This is primarily for debugging. 
It enables the creation of multiple auxiliary workers on a single node + auxiliary_workers = self.distributed_information.world_size + else: + auxiliary_workers = ( + self.distributed_information.world_size + // self.distributed_information.local_world_size + ) + + # Instantiate a MetricsReporter for all workers which are not rank 0 + if ( + self.distributed_information.rank is not None + and self.distributed_information.rank > 0 + and ( + not workers_only_on_rank_0 + or self.distributed_information.local_rank == 0 + ) + and system_tracking_interval is not None + ): + if workers_only_on_rank_0: + node_rank = ( + distributed_information.rank + // distributed_information.local_world_size + ) + else: + node_rank = distributed_information.rank + + self.metrics_reporter = MetricsReporter( + host=main_addr, + port=main_port, + rank=self.distributed_information.rank, + node_rank=node_rank, + interval=system_tracking_interval, + ) + + self.log.info( + f"Distributed worker {self.distributed_information.rank} connected" + ) + elif self.distributed_information.rank == 0: + # When running as the main worker, we initialize aim as usual. If there're multiple + # auxiliary workers, we also start a listening server. The auxiliary workers will connect + # to this server and periodically send over their system metrics + super().__init__( + repo, + experiment, + system_tracking_interval, + log_system_params, + capture_terminal_logs, + ) + + if ( + auxiliary_workers > 1 + and main_port is not None + and system_tracking_interval is not None + ): + self.log.info(f"There are {auxiliary_workers} workers") + + self.metrics_receiver = MetricsReceiver( + # Bind to 0.0.0.0 so that we can accept connections coming in from any interface + host="0.0.0.0", + port=main_port, + num_workers=auxiliary_workers - 1, + connection_timeout=self.connection_timeout, + ) + + self.metrics_receiver.start(self._push_auxiliary_worker_metrics) + + def _push_auxiliary_worker_metrics( + self, + stat: typing.Dict[str, typing.Any], + worker_info: typing.Dict[str, typing.Any], + ): + """Utility method which pushes the system metrics of an auxiliary worker to Aim + + Args: + stat: (:obj:`typing.Dict[str, typing.Any]`): A dictionary representation of + aim.ext.resource.stat.Stat + worker_info (:obj:`typing.Dict[str, typing.Any]`): A dictionary which represents the context of a + worker. 
For example, it can contain the fields {"rank": int, "node_rank": int} + """ + # TODO: Investigate whether it's better to spin up a dedicated RunTracker here or not + if self._run is None: + self.log.info( + f"The aim Run is inactive, will not register these metrics from {worker_info}" + ) + return + + tracker = self._run._tracker + context = copy.deepcopy(worker_info) + + for resource, usage in stat["system"].items(): + tracker( + usage, + name="{}{}".format( + aim.ext.resource.configs.AIM_RESOURCE_METRIC_PREFIX, + resource, + ), + context=context, + ) + + # Store GPU stats + for gpu_idx, gpu in enumerate(stat["gpus"]): + for resource, usage in gpu.items(): + context = copy.deepcopy(worker_info) + context.update({"gpu": gpu_idx}) + + tracker( + usage, + name="{}{}".format( + aim.ext.resource.configs.AIM_RESOURCE_METRIC_PREFIX, + resource, + ), + context=context, + ) + + def on_train_begin(self, args, state, control, model=None, **kwargs): + super().on_train_begin(args, state, control, model, **kwargs) + + if self.metrics_reporter: + self.metrics_reporter.start() + + def close(self): + try: + super().close() + finally: + if self.metrics_receiver is not None: + self.metrics_receiver.stop() + if self.metrics_reporter is not None: + self.metrics_reporter.stop() From c57f4a899672ab76f36b646ddecc64146a722b40 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Fri, 14 Mar 2025 18:16:39 +0400 Subject: [PATCH 17/30] [fix] Increase session pool size for sqlite engine (#3306) --- CHANGELOG.md | 1 - aim/storage/structured/db.py | 9 ++++++--- aim/web/api/db.py | 10 ++++++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c267f09e..f44412ac9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,6 @@ - Decrease client resources keep-alive time (mihran113) - Fix connection of data points on epoch alignment (mihran113) - Resolve issue with adding duplicate tags to the same run (mihran113) -- Resolve session refresh issues when db file is replaced (mihran113) - Improve error messages for remote tracking server (mihran113) - Fix spurious assertion error in message stream parsing (qzed) diff --git a/aim/storage/structured/db.py b/aim/storage/structured/db.py index 2849b3be4..830c0bc41 100644 --- a/aim/storage/structured/db.py +++ b/aim/storage/structured/db.py @@ -61,8 +61,12 @@ def __init__(self, path: str, readonly: bool = False): self.db_url = self.get_db_url(path) self.readonly = readonly self.engine = create_engine( - self.db_url, echo=(logging.INFO >= int(os.environ.get(AIM_LOG_LEVEL_KEY, logging.WARNING))) + self.db_url, + echo=(logging.INFO >= int(os.environ.get(AIM_LOG_LEVEL_KEY, logging.WARNING))), + pool_size=10, + max_overflow=20, ) + self.session_cls = scoped_session(sessionmaker(autoflush=False, bind=self.engine)) self._upgraded = None @classmethod @@ -90,8 +94,7 @@ def caches(self): return self._caches def get_session(self, autocommit=True): - session_cls = scoped_session(sessionmaker(autoflush=False, bind=self.engine)) - session = session_cls() + session = self.session_cls() setattr(session, 'autocommit', autocommit) return session diff --git a/aim/web/api/db.py b/aim/web/api/db.py index 80aeaa539..c38e2598f 100644 --- a/aim/web/api/db.py +++ b/aim/web/api/db.py @@ -14,13 +14,16 @@ get_db_url(), echo=(logging.INFO >= int(os.environ.get(AIM_LOG_LEVEL_KEY, logging.WARNING))), connect_args={'check_same_thread': False}, + pool_size=10, + max_overflow=20, ) + +SessionLocal = sessionmaker(autoflush=False, bind=engine) Base = declarative_base() def 
get_session(): - session_cls = sessionmaker(autoflush=False, bind=engine) - session = session_cls() + session = SessionLocal() try: yield session finally: @@ -29,8 +32,7 @@ def get_session(): @contextmanager def get_contexted_session(): - session_cls = sessionmaker(autoflush=False, bind=engine) - session = session_cls() + session = SessionLocal() try: yield session finally: From f731d3e4597b1f15296ceba2df3fd60d11d3e5b5 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Tue, 18 Mar 2025 14:43:59 +0400 Subject: [PATCH 18/30] [feat] Remove metric version check to improve metric retrieval performance (#3307) --- CHANGELOG.md | 1 + aim/cli/up/commands.py | 2 +- aim/distributed_hugging_face.py | 2 +- aim/sdk/adapters/distributed_hugging_face.py | 156 +++++++----------- aim/sdk/run.py | 8 - aim/storage/encoding/encoding.pyx | 2 +- .../RunOverviewTab/RunOverviewTab.tsx | 7 - 7 files changed, 62 insertions(+), 116 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f44412ac9..0436f92e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ ### Enhancements: - Skip metrics check when run is known to yield false result (alberttorosyan) - Correct indentation on query proxy object return statement (alberttorosyan) +- Remove metric version check to improve performance of metric retrieval (mihran113) ## 3.27.0 Dec 18, 2024 diff --git a/aim/cli/up/commands.py b/aim/cli/up/commands.py index de8ed008d..aafd73b2b 100644 --- a/aim/cli/up/commands.py +++ b/aim/cli/up/commands.py @@ -29,7 +29,7 @@ @click.command() @click.option('-h', '--host', default=AIM_UI_DEFAULT_HOST, type=str) @click.option('-p', '--port', default=AIM_UI_DEFAULT_PORT, type=int) -@click.option('-w', '--workers', default=1, type=int) +@click.option('-w', '--workers', default=2, type=int) @click.option('--uds', required=False, type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True)) @click.option('--repo', required=False, type=click.Path(exists=True, file_okay=False, dir_okay=True, writable=True)) @click.option('--tf_logs', type=click.Path(exists=True, readable=True)) diff --git a/aim/distributed_hugging_face.py b/aim/distributed_hugging_face.py index 0fa836d02..cbb9f8eec 100644 --- a/aim/distributed_hugging_face.py +++ b/aim/distributed_hugging_face.py @@ -1,2 +1,2 @@ # Alias to SDK distributed hugging face interface -from aim.sdk.adapters.distributed_hugging_face import AimCallback # noqa F401 +from aim.sdk.adapters.distributed_hugging_face import AimCallback # noqa: F401 diff --git a/aim/sdk/adapters/distributed_hugging_face.py b/aim/sdk/adapters/distributed_hugging_face.py index 76e5e6937..561bef82d 100644 --- a/aim/sdk/adapters/distributed_hugging_face.py +++ b/aim/sdk/adapters/distributed_hugging_face.py @@ -2,30 +2,30 @@ from aim.ext.resource.stat import Stat + try: import accelerate.utils.environment except ImportError: raise RuntimeError( - "This contrib module requires HuggingFace Accelerate to be installed. " - "Please install it with command: \n pip install accelerate" + 'This contrib module requires HuggingFace Accelerate to be installed. 
' + 'Please install it with command: \n pip install accelerate' ) import copy +import json +import logging +import select +import socket import struct import threading import time +import typing import aim -import aim.hugging_face import aim.ext.resource +import aim.hugging_face import aim.sdk.configs -import typing -import socket -import select -import logging -import json - class IncompletePackageError(Exception): pass @@ -41,14 +41,14 @@ class IncompleteDataError(IncompletePackageError): def packet_encode(usage: typing.Dict[str, typing.Any]) -> bytes: data = json.dumps(usage) - header = len(data).to_bytes(4, "big") - packet = b"".join((header, struct.pack(f"!{len(data)}s", data.encode("utf-8")))) + header = len(data).to_bytes(4, 'big') + packet = b''.join((header, struct.pack(f'!{len(data)}s', data.encode('utf-8')))) return packet def packet_decode(packet: bytes) -> typing.Dict[str, typing.Any]: - length = int.from_bytes(packet[:4], "big") - raw = struct.unpack_from(f"!{length}s", packet, 4)[0] + length = int.from_bytes(packet[:4], 'big') + raw = struct.unpack_from(f'!{length}s', packet, 4)[0] decoded = json.loads(raw) return decoded @@ -74,12 +74,10 @@ def __init__( self.node_rank = node_rank self.rank = rank - self.log = logging.getLogger(f"MetricsReporter{rank}") + self.log = logging.getLogger(f'MetricsReporter{rank}') self._connect(host=host, port=port) - self.tracker = ResourceTrackerForwarder( - tracker=self, interval=interval, capture_logs=False - ) + self.tracker = ResourceTrackerForwarder(tracker=self, interval=interval, capture_logs=False) def start(self): self.tracker.start() @@ -102,9 +100,7 @@ def _connect( while time.time() - start <= connection_timeout: # This should deal with both ipv4 and ipv6 hosts - for family, socktype, proto, canonname, sa in socket.getaddrinfo( - host, port, proto=socket.SOL_TCP - ): + for family, socktype, proto, canonname, sa in socket.getaddrinfo(host, port, proto=socket.SOL_TCP): self.client = socket.socket(family, socktype, proto) try: self.client.connect(sa) @@ -112,50 +108,43 @@ def _connect( except (ConnectionRefusedError, OSError) as e: self.client.close() self.log.info( - f"Could not connect to main worker due to {e} - " - f"will retry in {retry_seconds} seconds" + f'Could not connect to main worker due to {e} - will retry in {retry_seconds} seconds' ) time.sleep(retry_seconds) - raise ConnectionError( - f"Could not connect to server {host}:{port} after {connection_timeout} seconds" - ) + raise ConnectionError(f'Could not connect to server {host}:{port} after {connection_timeout} seconds') def __call__(self, stat: aim.ext.resource.tracker.Stat): if self.client is None: - self.log.info( - "Connection has already closed, will not propagate this system metrics snapshot" - ) + self.log.info('Connection has already closed, will not propagate this system metrics snapshot') return # This is invoked by @self.tracker raw = { - "stat": stat.stat_item.to_dict(), - "worker": { - "node_rank": self.node_rank, - "rank": self.rank, + 'stat': stat.stat_item.to_dict(), + 'worker': { + 'node_rank': self.node_rank, + 'rank': self.rank, }, } - self.log.debug(f"Send {raw}") + self.log.debug(f'Send {raw}') packet = packet_encode(raw) try: self.client.sendall(packet) except BrokenPipeError: self.log.info( - f"BrokenPipeError while transmitting system metrics {raw} - will stop recording system metrics" + f'BrokenPipeError while transmitting system metrics {raw} - will stop recording system metrics' ) try: self.stop() except RuntimeError as e: - if e.args[0] != 
"cannot join current thread": + if e.args[0] != 'cannot join current thread': # Calling stop() causes self.tracker() to try to join this thread. In turn that raises # this RuntimeError raise except Exception as e: - self.log.info( - f"{e} while transmitting system metrics {raw} - will ignore exception" - ) + self.log.info(f'{e} while transmitting system metrics {raw} - will ignore exception') class MetricsReceiver: @@ -177,7 +166,7 @@ def __init__( ] = None self.clients: typing.List[socket.socket] = [] - self.log = logging.getLogger("MetricsReceiver") + self.log = logging.getLogger('MetricsReceiver') self.server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self._wait_workers( @@ -212,7 +201,7 @@ def stop(self): self.thread.join() def _recv(self, sock: socket.socket, length: int) -> typing.Optional[bytes]: - data = b"" + data = b'' retries = 0 while len(data) < length and retries < 10: buf = sock.recv(length - len(data)) @@ -227,9 +216,7 @@ def _recv(self, sock: socket.socket, length: int) -> typing.Optional[bytes]: return data - def _recv_packet( - self, sock: socket.socket - ) -> typing.Optional[typing.Dict[str, typing.Any]]: + def _recv_packet(self, sock: socket.socket) -> typing.Optional[typing.Dict[str, typing.Any]]: try: header = self._recv(sock, 4) @@ -237,7 +224,7 @@ def _recv_packet( # The client disconnected return None - length = int.from_bytes(header, "big") + length = int.from_bytes(header, 'big') except IncompletePackageError: raise IncompleteHeaderError() try: @@ -254,21 +241,17 @@ def _collect_metrics(self): try: packet = self._recv_packet(client) except IncompletePackageError as e: - self.log.info( - f"Error {e} while receiving update - will assume this is a transient error" - ) + self.log.info(f'Error {e} while receiving update - will assume this is a transient error') continue if packet: - self.tracker(packet["stat"], packet["worker"]) + self.tracker(packet['stat'], packet['worker']) else: - self.log.info("Client disconnected") + self.log.info('Client disconnected') client.close() self.clients.remove(client) - def _wait_workers( - self, host: str, port: int, num_workers: int, connection_timeout: float - ): + def _wait_workers(self, host: str, port: int, num_workers: int, connection_timeout: float): # This may raise an exception, don't catch it here and let it flow to the caller self.server.bind((host, port)) self.server.listen(num_workers) @@ -276,23 +259,21 @@ def _wait_workers( # We're actually going to pause here, till we get all clients OR we run out of time start = time.time() - self.log.info(f"Waiting for {num_workers} workers to connect") + self.log.info(f'Waiting for {num_workers} workers to connect') # Block for 5 seconds while waiting for new connections while time.time() - start <= connection_timeout: read, _write, _error = select.select([self.server], [], [], 5.0) for server in read: client, _client_address = server.accept() self.clients.append(client) - self.log.info(f"Client {len(self.clients)}/{num_workers} connected") + self.log.info(f'Client {len(self.clients)}/{num_workers} connected') if len(self.clients) == num_workers: return self.server.close() - raise ConnectionError( - f"{num_workers - len(self.clients)} out of {num_workers} total clients did not connect" - ) + raise ConnectionError(f'{num_workers - len(self.clients)} out of {num_workers} total clients did not connect') class AimCallback(aim.hugging_face.AimCallback): @@ -301,15 +282,11 @@ def __init__( main_port: int, repo: typing.Optional[str] = None, experiment: typing.Optional[str] = 
None, - system_tracking_interval: typing.Optional[ - int - ] = aim.ext.resource.DEFAULT_SYSTEM_TRACKING_INT, + system_tracking_interval: typing.Optional[int] = aim.ext.resource.DEFAULT_SYSTEM_TRACKING_INT, log_system_params: typing.Optional[bool] = True, capture_terminal_logs: typing.Optional[bool] = True, main_addr: typing.Optional[str] = None, - distributed_information: typing.Optional[ - accelerate.utils.environment.CPUInformation - ] = None, + distributed_information: typing.Optional[accelerate.utils.environment.CPUInformation] = None, connection_timeout: int = 60 * 5, workers_only_on_rank_0: bool = True, ): @@ -345,13 +322,13 @@ def __init__( If unable auxiliary workers are unable to connect to main worker """ if main_addr is None: - main_addr = os.environ.get("MASTER_ADDR") + main_addr = os.environ.get('MASTER_ADDR') if not main_addr: - raise ValueError("main_addr cannot be empty") + raise ValueError('main_addr cannot be empty') - if not main_port or main_port <0: - raise ValueError("main_port must be a positive number") + if not main_port or main_port < 0: + raise ValueError('main_port must be a positive number') if distributed_information is None: distributed_information = accelerate.utils.get_cpu_distributed_information() @@ -365,32 +342,23 @@ def __init__( self.metrics_receiver: typing.Optional[MetricsReceiver] = None self._run: typing.Optional[aim.Run] = None - self.log = logging.getLogger("CustomAimCallback") + self.log = logging.getLogger('CustomAimCallback') if not workers_only_on_rank_0: # This is primarily for debugging. It enables the creation of multiple auxiliary workers on a single node auxiliary_workers = self.distributed_information.world_size else: - auxiliary_workers = ( - self.distributed_information.world_size - // self.distributed_information.local_world_size - ) + auxiliary_workers = self.distributed_information.world_size // self.distributed_information.local_world_size # Instantiate a MetricsReporter for all workers which are not rank 0 if ( self.distributed_information.rank is not None and self.distributed_information.rank > 0 - and ( - not workers_only_on_rank_0 - or self.distributed_information.local_rank == 0 - ) + and (not workers_only_on_rank_0 or self.distributed_information.local_rank == 0) and system_tracking_interval is not None ): if workers_only_on_rank_0: - node_rank = ( - distributed_information.rank - // distributed_information.local_world_size - ) + node_rank = distributed_information.rank // distributed_information.local_world_size else: node_rank = distributed_information.rank @@ -402,9 +370,7 @@ def __init__( interval=system_tracking_interval, ) - self.log.info( - f"Distributed worker {self.distributed_information.rank} connected" - ) + self.log.info(f'Distributed worker {self.distributed_information.rank} connected') elif self.distributed_information.rank == 0: # When running as the main worker, we initialize aim as usual. If there're multiple # auxiliary workers, we also start a listening server. 
The auxiliary workers will connect @@ -417,16 +383,12 @@ def __init__( capture_terminal_logs, ) - if ( - auxiliary_workers > 1 - and main_port is not None - and system_tracking_interval is not None - ): - self.log.info(f"There are {auxiliary_workers} workers") + if auxiliary_workers > 1 and main_port is not None and system_tracking_interval is not None: + self.log.info(f'There are {auxiliary_workers} workers') self.metrics_receiver = MetricsReceiver( # Bind to 0.0.0.0 so that we can accept connections coming in from any interface - host="0.0.0.0", + host='0.0.0.0', port=main_port, num_workers=auxiliary_workers - 1, connection_timeout=self.connection_timeout, @@ -449,18 +411,16 @@ def _push_auxiliary_worker_metrics( """ # TODO: Investigate whether it's better to spin up a dedicated RunTracker here or not if self._run is None: - self.log.info( - f"The aim Run is inactive, will not register these metrics from {worker_info}" - ) + self.log.info(f'The aim Run is inactive, will not register these metrics from {worker_info}') return tracker = self._run._tracker context = copy.deepcopy(worker_info) - for resource, usage in stat["system"].items(): + for resource, usage in stat['system'].items(): tracker( usage, - name="{}{}".format( + name='{}{}'.format( aim.ext.resource.configs.AIM_RESOURCE_METRIC_PREFIX, resource, ), @@ -468,14 +428,14 @@ def _push_auxiliary_worker_metrics( ) # Store GPU stats - for gpu_idx, gpu in enumerate(stat["gpus"]): + for gpu_idx, gpu in enumerate(stat['gpus']): for resource, usage in gpu.items(): context = copy.deepcopy(worker_info) - context.update({"gpu": gpu_idx}) + context.update({'gpu': gpu_idx}) tracker( usage, - name="{}{}".format( + name='{}{}'.format( aim.ext.resource.configs.AIM_RESOURCE_METRIC_PREFIX, resource, ), diff --git a/aim/sdk/run.py b/aim/sdk/run.py index 59bc4d806..775aed973 100644 --- a/aim/sdk/run.py +++ b/aim/sdk/run.py @@ -567,14 +567,6 @@ def get_metric(self, name: str, context: Context) -> Optional['Metric']: Returns: :obj:`Metric` object if exists, `None` otherwise. """ - if self.read_only and not Run._metric_version_warning_shown: - if self.check_metrics_version(): - logger.warning( - f'Detected sub-optimal format metrics for Run {self.hash}. 
Consider upgrading repo ' - f'to improve queries performance:' - ) - logger.warning(f"aim storage --repo {self.repo.path} upgrade 3.11+ '*'") - Run._metric_version_warning_shown = True return self._get_sequence('metric', name, context) diff --git a/aim/storage/encoding/encoding.pyx b/aim/storage/encoding/encoding.pyx index 71f2ca40b..308627e01 100644 --- a/aim/storage/encoding/encoding.pyx +++ b/aim/storage/encoding/encoding.pyx @@ -22,7 +22,7 @@ from aim.storage.encoding.encoding_native cimport ( decode_double, decode_utf_8_str, ) -from aim.storage.encoding.encoding_native cimport decode_path # noqa F401 +from aim.storage.encoding.encoding_native cimport decode_path # noqa: F401 from aim.storage.utils import ArrayFlagType, ObjectFlagType, CustomObjectFlagType from aim.storage.utils import ArrayFlag, ObjectFlag from aim.storage.container import ContainerValue diff --git a/aim/web/ui/src/pages/RunDetail/RunOverviewTab/RunOverviewTab.tsx b/aim/web/ui/src/pages/RunDetail/RunOverviewTab/RunOverviewTab.tsx index 1cd4bb89d..24b28e31f 100644 --- a/aim/web/ui/src/pages/RunDetail/RunOverviewTab/RunOverviewTab.tsx +++ b/aim/web/ui/src/pages/RunDetail/RunOverviewTab/RunOverviewTab.tsx @@ -8,8 +8,6 @@ import { ANALYTICS_EVENT_KEYS } from 'config/analytics/analyticsKeysMap'; import * as analytics from 'services/analytics'; -import useRunMetricsBatch from '../hooks/useRunMetricsBatch'; - import GitInfoCard from './components/GitInfoCard'; import RunOverviewTabMetricsCard from './components/MetricsCard/RunOverviewTabMetricsCard'; import RunOverviewTabArtifactsCard from './components/ArtifactsCard/RunOverviewTabArtifactsCard'; @@ -28,11 +26,6 @@ function RunOverviewTab({ runData, runHash }: IRunOverviewTabProps) { const overviewSectionContentRef = React.useRef(null); const [containerHeight, setContainerHeight] = React.useState(0); - useRunMetricsBatch({ - runTraces: runData.runTraces, - runHash, - }); - React.useEffect(() => { analytics.pageView( ANALYTICS_EVENT_KEYS.runDetails.tabs['overview'].tabView, From 51b8435c42bef21c37f381d65f1c3014785dbaf0 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Thu, 20 Mar 2025 18:38:40 +0400 Subject: [PATCH 19/30] [fix] Improve RT exception handling (#3309) --- aim/ext/transport/message_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/aim/ext/transport/message_utils.py b/aim/ext/transport/message_utils.py index 20f9c717f..ceb52fac2 100644 --- a/aim/ext/transport/message_utils.py +++ b/aim/ext/transport/message_utils.py @@ -52,12 +52,18 @@ def unpack_stream(stream) -> Tuple[bytes, bytes]: def raise_exception(server_exception): + from filelock import Timeout + module = importlib.import_module(server_exception.get('module_name')) exception = getattr(module, server_exception.get('class_name')) args = json.loads(server_exception.get('args') or []) message = server_exception.get('message') - raise exception(*args) if args else Exception(message) + # special handling for lock timeouts as they require lock argument which can't be passed over the network + if exception == Timeout: + raise Exception(message) + + raise exception(*args) if args else exception() def build_exception(exception: Exception): From fba908f153484fc64a807379bdc3ec1238e8ea98 Mon Sep 17 00:00:00 2001 From: Albert Torosyan <32957250+alberttorosyan@users.noreply.github.com> Date: Thu, 20 Mar 2025 19:39:53 +0400 Subject: [PATCH 20/30] [feat] Move indexing thread to `aim up` main process (#3311) --- aim/cli/up/commands.py | 4 ++++ aim/web/api/__init__.py | 6 ------ 
aim/web/api/projects/views.py | 11 ----------- 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/aim/cli/up/commands.py b/aim/cli/up/commands.py index aafd73b2b..42a23f50c 100644 --- a/aim/cli/up/commands.py +++ b/aim/cli/up/commands.py @@ -11,6 +11,7 @@ get_repo_instance, set_log_level, ) +from aim.sdk.index_manager import RepoIndexManager from aim.sdk.repo import Repo from aim.sdk.utils import clean_repo_path from aim.web.configs import ( @@ -122,6 +123,9 @@ def up( if profiler: os.environ[AIM_PROFILER_KEY] = '1' + index_mng = RepoIndexManager.get_index_manager(repo_inst) + index_mng.start_indexing_thread() + try: server_cmd = build_uvicorn_command( 'aim.web.run:app', diff --git a/aim/web/api/__init__.py b/aim/web/api/__init__.py index 52b5095b0..cb553a1e4 100644 --- a/aim/web/api/__init__.py +++ b/aim/web/api/__init__.py @@ -23,7 +23,6 @@ def create_app(): max_age=86400, ) - from aim.sdk.index_manager import RepoIndexManager from aim.web.api.dashboard_apps.views import dashboard_apps_router from aim.web.api.dashboards.views import dashboards_router from aim.web.api.experiments.views import experiment_router @@ -36,11 +35,6 @@ def create_app(): from aim.web.api.views import statics_router from aim.web.configs import AIM_UI_BASE_PATH - # The indexing thread has to run in the same process as the uvicorn app itself. - # This allows sharing state of indexing using memory instead of process synchronization methods. - index_mng = RepoIndexManager.get_index_manager(Project().repo) - index_mng.start_indexing_thread() - api_app = FastAPI() api_app.add_middleware(GZipMiddleware, compresslevel=1) api_app.add_middleware(ResourceCleanupMiddleware) diff --git a/aim/web/api/projects/views.py b/aim/web/api/projects/views.py index c32cc5452..36856ba27 100644 --- a/aim/web/api/projects/views.py +++ b/aim/web/api/projects/views.py @@ -5,7 +5,6 @@ from logging import getLogger from typing import Optional, Tuple -from aim.sdk.index_manager import RepoIndexManager from aim.storage.locking import AutoFileLock from aim.web.api.projects.project import Project from aim.web.api.projects.pydantic_models import ( @@ -171,13 +170,3 @@ async def project_params_api(sequence: Optional[Tuple[str, ...]] = Query(()), ex } response.update(**project.repo.collect_sequence_info(sequence)) return response - - -@projects_router.get('/status/') -async def project_status_api(): - project = Project() - - if not project.exists(): - raise HTTPException(status_code=404) - - return RepoIndexManager.get_index_manager(project.repo).repo_status From e02b98bac3d287907e166b5a9b65a50eecc73e86 Mon Sep 17 00:00:00 2001 From: Albert Torosyan Date: Fri, 21 Mar 2025 12:51:19 +0400 Subject: [PATCH 21/30] Bump up Aim to v3.28.0 --- CHANGELOG.md | 16 +++++++++++----- aim/VERSION | 2 +- aim/web/ui/package.json | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0436f92e4..94d102dd8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,14 @@ # Changelog -## Unreleased +## 3.28.0 Mar 21, 2025 + +### Enhancements: +- Skip metrics check when run is known to yield false result (alberttorosyan) +- Remove metric version check to improve performance of metric retrieval (mihran113) +- Move indexing thread to main process of `aim up` (alberttorosyan) +- Add AimCallback implementation for hugging face distributed runs (VassilisVassiliadis) +- Add py.typed marker to allow usage of existing type annotations (bluenote10) + ### Fixes: - Decrease client resources keep-alive time (mihran113) @@ -8,11 
+16,9 @@ - Resolve issue with adding duplicate tags to the same run (mihran113) - Improve error messages for remote tracking server (mihran113) - Fix spurious assertion error in message stream parsing (qzed) - -### Enhancements: -- Skip metrics check when run is known to yield false result (alberttorosyan) - Correct indentation on query proxy object return statement (alberttorosyan) -- Remove metric version check to improve performance of metric retrieval (mihran113) +- Fix typing issues in S3ArtifactStorage implementation (sbatchelder) + ## 3.27.0 Dec 18, 2024 diff --git a/aim/VERSION b/aim/VERSION index 8c5312044..a72fd67b6 100644 --- a/aim/VERSION +++ b/aim/VERSION @@ -1 +1 @@ -3.27.0 +3.28.0 diff --git a/aim/web/ui/package.json b/aim/web/ui/package.json index 567b5732a..c9a9976a0 100644 --- a/aim/web/ui/package.json +++ b/aim/web/ui/package.json @@ -1,6 +1,6 @@ { "name": "ui_v2", - "version": "3.27.0", + "version": "3.28.0", "private": true, "dependencies": { "@aksel/structjs": "^1.0.0", From 897459a48bd31021af50ee0b3b2c172077f41d09 Mon Sep 17 00:00:00 2001 From: Albert Torosyan <32957250+alberttorosyan@users.noreply.github.com> Date: Tue, 1 Apr 2025 11:33:44 +0400 Subject: [PATCH 22/30] [feat] Constant indexing of in-progress Runs (#3310) --- aim/cli/runs/commands.py | 2 +- aim/cli/storage/commands.py | 4 +- aim/cli/up/commands.py | 4 +- aim/sdk/index_manager.py | 272 ++++++++++++++------------------- aim/sdk/repo.py | 26 ++-- aim/storage/rockscontainer.pyx | 2 +- setup.py | 1 + 7 files changed, 132 insertions(+), 179 deletions(-) diff --git a/aim/cli/runs/commands.py b/aim/cli/runs/commands.py index 9f7413db5..26a4b58d6 100644 --- a/aim/cli/runs/commands.py +++ b/aim/cli/runs/commands.py @@ -192,7 +192,7 @@ def update_metrics(ctx, yes): if not confirmed: return - index_manager = RepoIndexManager.get_index_manager(repo) + index_manager = RepoIndexManager.get_index_manager(repo, disable_monitoring=True) hashes = repo.list_all_runs() for run_hash in tqdm.tqdm(hashes, desc='Updating runs', total=len(hashes)): meta_tree = repo.request_tree('meta', run_hash, read_only=False, from_union=False) diff --git a/aim/cli/storage/commands.py b/aim/cli/storage/commands.py index 3210f7a69..32bfe01d5 100644 --- a/aim/cli/storage/commands.py +++ b/aim/cli/storage/commands.py @@ -51,7 +51,7 @@ def to_3_11(ctx, hashes, yes): if not confirmed: return - index_manager = RepoIndexManager.get_index_manager(repo) + index_manager = RepoIndexManager.get_index_manager(repo, disable_monitoring=True) for run_hash in tqdm(matched_hashes): try: run = Run(run_hash, repo=repo) @@ -97,7 +97,7 @@ def restore_runs(ctx, hashes, yes): return remaining_runs = [] - index_manager = RepoIndexManager.get_index_manager(repo) + index_manager = RepoIndexManager.get_index_manager(repo, disable_monitoring=True) for run_hash in tqdm(matched_hashes): try: restore_run_backup(repo, run_hash) diff --git a/aim/cli/up/commands.py b/aim/cli/up/commands.py index 42a23f50c..e294c4f9b 100644 --- a/aim/cli/up/commands.py +++ b/aim/cli/up/commands.py @@ -123,9 +123,7 @@ def up( if profiler: os.environ[AIM_PROFILER_KEY] = '1' - index_mng = RepoIndexManager.get_index_manager(repo_inst) - index_mng.start_indexing_thread() - + RepoIndexManager.get_index_manager(repo_inst) try: server_cmd = build_uvicorn_command( 'aim.web.run:app', diff --git a/aim/sdk/index_manager.py b/aim/sdk/index_manager.py index 7c26cb2bf..f7e01502b 100644 --- a/aim/sdk/index_manager.py +++ b/aim/sdk/index_manager.py @@ -1,191 +1,143 @@ -import contextlib -import datetime import 
logging import os -import time +import queue +import threading from pathlib import Path -from threading import Thread -from typing import Iterable import aimrocks.errors -import pytz from aim.sdk.repo import Repo -from aim.sdk.run_status_watcher import Event -from aim.storage.locking import RefreshLock +from watchdog.events import FileSystemEventHandler +from watchdog.observers import Observer logger = logging.getLogger(__name__) +class NewChunkCreatedHandler(FileSystemEventHandler): + def __init__(self, manager): + self.manager = manager + + def on_created(self, event): + if event.is_directory: + chunk_name = os.path.basename(event.src_path) + logger.debug(f'Detected new chunk directory: {chunk_name}') + self.manager.monitor_chunk_directory(event.src_path) + + +class ChunkChangedHandler(FileSystemEventHandler): + def __init__(self, manager): + self.manager = manager + self.pending_events = set() + self.lock = threading.Lock() + + def _trigger_event(self, run_hash): + with self.lock: + if run_hash not in self.pending_events: + self.pending_events.add(run_hash) + threading.Timer(0.5, self._process_event, [run_hash]).start() + + def _process_event(self, run_hash): + with self.lock: + if run_hash in self.pending_events: + self.pending_events.remove(run_hash) + logger.debug(f'Triggering indexing for run {run_hash}') + self.manager.add_run_to_queue(run_hash) + + def on_any_event(self, event): + if event.is_directory: + return + + event_path = Path(event.src_path) + parent_dir = event_path.parent + run_hash = parent_dir.name + + # Ensure the parent directory is directly inside meta/chunks/ + if parent_dir.parent != self.manager.chunks_dir: + logger.debug(f'Skipping event outside valid chunk directory: {event.src_path}') + return + + if event_path.name.startswith('LOG'): + logger.debug(f'Skipping event for LOG-prefixed file: {event.src_path}') + return + + logger.debug(f'Detected change in {event.src_path}') + self._trigger_event(run_hash) + + class RepoIndexManager: index_manager_pool = {} - INDEXING_GRACE_PERIOD = 10 @classmethod - def get_index_manager(cls, repo: Repo): + def get_index_manager(cls, repo: Repo, disable_monitoring: bool = False): mng = cls.index_manager_pool.get(repo.path, None) if mng is None: - mng = RepoIndexManager(repo) + mng = RepoIndexManager(repo, disable_monitoring) cls.index_manager_pool[repo.path] = mng return mng - def __init__(self, repo: Repo): + def __init__(self, repo: Repo, disable_monitoring: bool): self.repo_path = repo.path self.repo = repo - self.progress_dir = Path(self.repo_path) / 'meta' / 'progress' - self.progress_dir.mkdir(parents=True, exist_ok=True) - - self.heartbeat_dir = Path(self.repo_path) / 'check_ins' - self.run_heartbeat_cache = {} + self.chunks_dir = Path(self.repo_path) / 'meta' / 'chunks' + self.chunks_dir.mkdir(parents=True, exist_ok=True) - self._indexing_in_progress = False - self._reindex_thread: Thread = None self._corrupted_runs = set() - @property - def repo_status(self): - if self._indexing_in_progress is True: - return 'indexing in progress' - if self.reindex_needed: - return 'needs indexing' - return 'up-to-date' - - @property - def reindex_needed(self) -> bool: - runs_with_progress = os.listdir(self.progress_dir) - return len(runs_with_progress) > 0 - - def start_indexing_thread(self): - logger.info(f"Starting indexing thread for repo '{self.repo_path}'") - self._reindex_thread = Thread(target=self._run_forever, daemon=True) - self._reindex_thread.start() - - def _run_forever(self): - idle_cycles = 0 - while True: - 
self._indexing_in_progress = False - for run_hash in self._next_stalled_run(): - logger.info(f'Found un-indexed run {run_hash}. Indexing...') - self._indexing_in_progress = True - idle_cycles = 0 - self.index(run_hash) - - # sleep for small interval to release index db lock in between and allow - # other running jobs to properly finalize and index Run. - sleep_interval = 0.1 - time.sleep(sleep_interval) - if not self._indexing_in_progress: - idle_cycles += 1 - sleep_interval = 2 * idle_cycles if idle_cycles < 5 else 10 - logger.info( - f'No un-indexed runs found. Next check will run in {sleep_interval} seconds. ' - f'Waiting for un-indexed run...' - ) - time.sleep(sleep_interval) - - def _runs_with_progress(self) -> Iterable[str]: - runs_with_progress = filter(lambda x: x not in self._corrupted_runs, os.listdir(self.progress_dir)) - run_hashes = sorted(runs_with_progress, key=lambda r: os.path.getmtime(os.path.join(self.progress_dir, r))) - return run_hashes - - def _next_stalled_run(self): - for run_hash in self._runs_with_progress(): - if self._is_run_stalled(run_hash): - yield run_hash - - def _is_run_stalled(self, run_hash: str) -> bool: - stalled = False - heartbeat_files = list(sorted(self.heartbeat_dir.glob(f'{run_hash}-*-progress-*-*'), reverse=True)) - if heartbeat_files: - last_heartbeat = Event(heartbeat_files[0].name) - last_recorded_heartbeat = self.run_heartbeat_cache.get(run_hash) - if last_recorded_heartbeat is None: - self.run_heartbeat_cache[run_hash] = last_heartbeat - elif last_heartbeat.idx > last_recorded_heartbeat.idx: - self.run_heartbeat_cache[run_hash] = last_heartbeat - else: - time_passed = time.time() - last_recorded_heartbeat.detected_epoch_time - if last_recorded_heartbeat.next_event_in + RepoIndexManager.INDEXING_GRACE_PERIOD < time_passed: - stalled = True + if not disable_monitoring: + self.indexing_queue = queue.PriorityQueue() + self.lock = threading.Lock() + + self.observer = Observer() + self.new_chunk_handler = NewChunkCreatedHandler(self) + self.chunk_change_handler = ChunkChangedHandler(self) + + self.observer.schedule(self.new_chunk_handler, self.chunks_dir, recursive=True) + self._monitor_existing_chunks() + self.observer.start() + + self._reindex_thread = threading.Thread(target=self._process_queue, daemon=True) + self._reindex_thread.start() + + def _monitor_existing_chunks(self): + for chunk_path in self.chunks_dir.iterdir(): + if chunk_path.is_dir(): + logger.debug(f'Monitoring existing chunk: {chunk_path}') + self.monitor_chunk_directory(chunk_path) + + def monitor_chunk_directory(self, chunk_path): + """Ensure chunk directory is monitored using a single handler.""" + if str(chunk_path) not in self.observer._watches: + self.observer.schedule(self.chunk_change_handler, chunk_path, recursive=True) + logger.debug(f'Started monitoring chunk directory: {chunk_path}') else: - stalled = True - return stalled + logger.debug(f'Chunk directory already monitored: {chunk_path}') - def _index_lock_path(self): - return Path(self.repo.path) / 'locks' / 'index' + def add_run_to_queue(self, run_hash): + if run_hash in self._corrupted_runs: + return + timestamp = os.path.getmtime(os.path.join(self.chunks_dir, run_hash)) + with self.lock: + self.indexing_queue.put((timestamp, run_hash)) + logger.debug(f'Run {run_hash} added to indexing queue with timestamp {timestamp}') - @contextlib.contextmanager - def lock_index(self, lock: RefreshLock): - try: - self._safe_acquire_lock(lock) - yield - finally: - lock.release() - - def _safe_acquire_lock(self, lock: 
RefreshLock): - last_touch_seen = None - prev_touch_time = None - last_owner_id = None + def _process_queue(self): while True: - try: - lock.acquire() - logger.debug('Lock is acquired!') - break - except TimeoutError: - owner_id = lock.owner_id() - if owner_id != last_owner_id: - logger.debug(f'Lock has been acquired by {owner_id}') - last_owner_id = owner_id - prev_touch_time = None - else: # same holder as from prev. iteration - last_touch_time = lock.last_refresh_time() - if last_touch_time != prev_touch_time: - prev_touch_time = last_touch_time - last_touch_seen = time.time() - logger.debug(f'Lock has been refreshed. Touch time: {last_touch_time}') - continue - assert last_touch_seen is not None - if time.time() - last_touch_seen > RefreshLock.GRACE_PERIOD: - logger.debug('Grace period exceeded. Force-acquiring the lock.') - with lock.meta_lock(): - # double check holder ID - if lock.owner_id() != last_owner_id: # someone else grabbed lock - continue - else: - lock.force_release() - try: - lock.acquire() - logger.debug('lock has been forcefully acquired!') - break - except TimeoutError: - continue - else: - logger.debug( - f'Countdown to force-acquire lock. ' - f'Time remaining: {RefreshLock.GRACE_PERIOD - (time.time() - last_touch_seen)}' - ) - - def run_needs_indexing(self, run_hash: str) -> bool: - return os.path.exists(self.progress_dir / run_hash) - - def index( - self, - run_hash, - ) -> bool: - lock = RefreshLock(self._index_lock_path(), timeout=10) - with self.lock_index(lock): - index = self.repo._get_index_tree('meta', 0).view(()) - try: - meta_tree = self.repo.request_tree( - 'meta', run_hash, read_only=True, from_union=False, no_cache=True - ).subtree('meta') - meta_run_tree = meta_tree.subtree('chunks').subtree(run_hash) - meta_run_tree.finalize(index=index) - if meta_run_tree.get('end_time') is None: - index['meta', 'chunks', run_hash, 'end_time'] = datetime.datetime.now(pytz.utc).timestamp() - except (aimrocks.errors.RocksIOError, aimrocks.errors.Corruption): - logger.warning(f"Indexing thread detected corrupted run '{run_hash}'. Skipping.") - self._corrupted_runs.add(run_hash) - return True + _, run_hash = self.indexing_queue.get() + logger.debug(f'Indexing run {run_hash}...') + self.index(run_hash) + self.indexing_queue.task_done() + + def index(self, run_hash): + index = self.repo._get_index_tree('meta', 0).view(()) + try: + meta_tree = self.repo.request_tree( + 'meta', run_hash, read_only=True, from_union=False, no_cache=True, skip_read_optimization=True + ).subtree('meta') + meta_run_tree = meta_tree.subtree('chunks').subtree(run_hash) + meta_run_tree.finalize(index=index) + except (aimrocks.errors.RocksIOError, aimrocks.errors.Corruption): + logger.warning(f"Indexing thread detected corrupted run '{run_hash}'. 
Skipping.") + self._corrupted_runs.add(run_hash) + return True diff --git a/aim/sdk/repo.py b/aim/sdk/repo.py index 6d2471cb2..b37838421 100644 --- a/aim/sdk/repo.py +++ b/aim/sdk/repo.py @@ -269,19 +269,22 @@ def get_version(cls, path: str): def is_remote_path(cls, path: str): return path.startswith('aim://') - def _get_container(self, name: str, read_only: bool, from_union: bool = False) -> Container: + def _get_container(self, name: str, read_only: bool, from_union: bool = False, skip_read_optimization: bool = False) -> Container: + # TODO [AT]: refactor get container/tree logic to make it more simple if self.read_only and not read_only: raise ValueError('Repo is read-only') container_config = ContainerConfig(name, None, read_only=read_only) container = self.container_pool.get(container_config) if container is None: - path = os.path.join(self.path, name) if from_union: - container = RocksUnionContainer(path, read_only=read_only) + # Temporarily use index db when getting data from union. + path = os.path.join(self.path, name, 'index') + container = RocksContainer(path, read_only=read_only, skip_read_optimization=skip_read_optimization) self.persistent_pool[container_config] = container else: - container = RocksContainer(path, read_only=read_only) + path = os.path.join(self.path, name) + container = RocksContainer(path, read_only=read_only, skip_read_optimization=skip_read_optimization) self.container_pool[container_config] = container return container @@ -314,9 +317,11 @@ def request_tree( read_only: bool, from_union: bool = False, # TODO maybe = True by default no_cache: bool = False, + skip_read_optimization: bool = False ): if not self.is_remote_repo: - return self.request(name, sub, read_only=read_only, from_union=from_union, no_cache=no_cache).tree() + return self.request(name, sub, read_only=read_only, from_union=from_union, no_cache=no_cache, + skip_read_optimization=skip_read_optimization).tree() else: return ProxyTree(self._client, name, sub, read_only=read_only, from_union=from_union, no_cache=no_cache) @@ -328,6 +333,7 @@ def request( read_only: bool, from_union: bool = False, # TODO maybe = True by default no_cache: bool = False, + skip_read_optimization: bool = False ): container_config = ContainerConfig(name, sub, read_only) container_view = self.container_view_pool.get(container_config) @@ -338,7 +344,8 @@ def request( else: assert sub is not None path = os.path.join(name, 'chunks', sub) - container = self._get_container(path, read_only=True, from_union=from_union) + container = self._get_container(path, read_only=True, from_union=from_union, + skip_read_optimization=skip_read_optimization) else: assert sub is not None path = os.path.join(name, 'chunks', sub) @@ -1005,10 +1012,7 @@ def optimize_container(path, extra_options): if self.is_remote_repo: self._remote_repo_proxy._close_run(run_hash) - from aim.sdk.index_manager import RepoIndexManager - lock_manager = LockManager(self.path) - index_manager = RepoIndexManager.get_index_manager(self) if lock_manager.release_locks(run_hash, force=True): # Run rocksdb optimizations if container locks are removed @@ -1016,8 +1020,6 @@ def optimize_container(path, extra_options): seqs_db_path = os.path.join(self.path, 'seqs', 'chunks', run_hash) optimize_container(meta_db_path, extra_options={'compaction': True}) optimize_container(seqs_db_path, extra_options={}) - if index_manager.run_needs_indexing(run_hash): - index_manager.index(run_hash) def _recreate_index(self): from tqdm import tqdm @@ -1028,7 +1030,7 @@ def 
_recreate_index(self): from aim.sdk.index_manager import RepoIndexManager - index_manager = RepoIndexManager.get_index_manager(self) + index_manager = RepoIndexManager.get_index_manager(self, disable_monitoring=True) # force delete the index db and the locks diff --git a/aim/storage/rockscontainer.pyx b/aim/storage/rockscontainer.pyx index 1be6f9086..d21e28c32 100644 --- a/aim/storage/rockscontainer.pyx +++ b/aim/storage/rockscontainer.pyx @@ -144,7 +144,7 @@ class RocksContainer(Container): lock_cls = self.get_lock_cls() self._lock = lock_cls(self._lock_path, timeout) self._lock.acquire() - else: + elif not self._extra_opts.get('skip_read_optimization', False): self.optimize_for_read() self._db = aimrocks.DB(str(self.path), diff --git a/setup.py b/setup.py index 00725b280..983b38016 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ def package_files(directory): 'packaging>=15.0', 'python-dateutil', 'requests', + 'watchdog', 'websockets', 'boto3', ] From e206b50bfcfb21e9368b968f70f4391b44139231 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Wed, 2 Apr 2025 15:46:34 +0400 Subject: [PATCH 23/30] [fix] Resolve issue of min/max calculation for single point metrics (#3315) --- CHANGELOG.md | 5 +++++ aim/web/ui/src/utils/aggregateGroupData.ts | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94d102dd8..d55c7567e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Unreleased: + +### Fixes: +- Fix min/max calculation for single point metrics (mihran113) + ## 3.28.0 Mar 21, 2025 ### Enhancements: diff --git a/aim/web/ui/src/utils/aggregateGroupData.ts b/aim/web/ui/src/utils/aggregateGroupData.ts index 0a0acc69e..2fa05644a 100644 --- a/aim/web/ui/src/utils/aggregateGroupData.ts +++ b/aim/web/ui/src/utils/aggregateGroupData.ts @@ -113,6 +113,17 @@ export function aggregateGroupData({ } } } + // add special case handling for single point metrics + if (trace.xValues.length === 1) { + const step = trace.xValues[0]; + let value = chartXValues.indexOf(step); + let y = trace.yValues[0]; + if (yValuesPerX.hasOwnProperty(value)) { + yValuesPerX[value].push(y); + } else { + yValuesPerX[value] = [y]; + } + } } } From 943942c54e613697f29babe56dde0dc1d18f69e7 Mon Sep 17 00:00:00 2001 From: Albert Torosyan <32957250+alberttorosyan@users.noreply.github.com> Date: Thu, 3 Apr 2025 16:09:34 +0400 Subject: [PATCH 24/30] [fix] Use polling observer to make sure new file modifications are detected (#3316) --- aim/sdk/index_manager.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/aim/sdk/index_manager.py b/aim/sdk/index_manager.py index f7e01502b..ff8eaa723 100644 --- a/aim/sdk/index_manager.py +++ b/aim/sdk/index_manager.py @@ -10,6 +10,7 @@ from aim.sdk.repo import Repo from watchdog.events import FileSystemEventHandler from watchdog.observers import Observer +from watchdog.observers.polling import PollingObserver logger = logging.getLogger(__name__) @@ -20,7 +21,7 @@ def __init__(self, manager): self.manager = manager def on_created(self, event): - if event.is_directory: + if event.is_directory and Path(event.src_path).parent == self.manager.chunks_dir: chunk_name = os.path.basename(event.src_path) logger.debug(f'Detected new chunk directory: {chunk_name}') self.manager.monitor_chunk_directory(event.src_path) @@ -89,13 +90,17 @@ def __init__(self, repo: Repo, disable_monitoring: bool): self.indexing_queue = queue.PriorityQueue() self.lock = threading.Lock() - self.observer = Observer() + 
self.new_chunk_observer = Observer() + self.chunk_change_observer = PollingObserver() + self.new_chunk_handler = NewChunkCreatedHandler(self) self.chunk_change_handler = ChunkChangedHandler(self) - self.observer.schedule(self.new_chunk_handler, self.chunks_dir, recursive=True) + self.new_chunk_observer.schedule(self.new_chunk_handler, self.chunks_dir, recursive=True) + self.new_chunk_observer.start() + self._monitor_existing_chunks() - self.observer.start() + self.chunk_change_observer.start() self._reindex_thread = threading.Thread(target=self._process_queue, daemon=True) self._reindex_thread.start() @@ -108,8 +113,8 @@ def _monitor_existing_chunks(self): def monitor_chunk_directory(self, chunk_path): """Ensure chunk directory is monitored using a single handler.""" - if str(chunk_path) not in self.observer._watches: - self.observer.schedule(self.chunk_change_handler, chunk_path, recursive=True) + if str(chunk_path) not in self.chunk_change_observer._watches: + self.chunk_change_observer.schedule(self.chunk_change_handler, chunk_path, recursive=True) logger.debug(f'Started monitoring chunk directory: {chunk_path}') else: logger.debug(f'Chunk directory already monitored: {chunk_path}') From 02bdcdd21203e36f6f315e5050f42db836d639f5 Mon Sep 17 00:00:00 2001 From: Albert Torosyan <32957250+alberttorosyan@users.noreply.github.com> Date: Fri, 4 Apr 2025 16:21:15 +0400 Subject: [PATCH 25/30] [feat] Mark stalled runs as finished (#3314) --- aim/cli/up/commands.py | 3 + aim/sdk/repo.py | 23 ++- aim/sdk/reporter/file_manager.py | 6 +- aim/sdk/run_status_manager.py | 95 ++++++++++++ aim/sdk/run_status_watcher.py | 9 +- aim/storage/arrayview.py | 12 +- aim/storage/artifacts/artifact_storage.py | 9 +- aim/storage/inmemorytreeview.py | 6 +- aim/storage/query.py | 3 +- aim/storage/rockscontainer.pyx | 12 +- aim/storage/structured/entities.py | 171 ++++++++++++++-------- aim/storage/treeview.py | 39 +++-- 12 files changed, 289 insertions(+), 99 deletions(-) create mode 100644 aim/sdk/run_status_manager.py diff --git a/aim/cli/up/commands.py b/aim/cli/up/commands.py index e294c4f9b..4775e0aa8 100644 --- a/aim/cli/up/commands.py +++ b/aim/cli/up/commands.py @@ -13,6 +13,7 @@ ) from aim.sdk.index_manager import RepoIndexManager from aim.sdk.repo import Repo +from aim.sdk.run_status_manager import RunStatusManager from aim.sdk.utils import clean_repo_path from aim.web.configs import ( AIM_ENV_MODE_KEY, @@ -124,6 +125,8 @@ def up( os.environ[AIM_PROFILER_KEY] = '1' RepoIndexManager.get_index_manager(repo_inst) + run_status_mng = RunStatusManager(repo_inst) + run_status_mng.start() try: server_cmd = build_uvicorn_command( 'aim.web.run:app', diff --git a/aim/sdk/repo.py b/aim/sdk/repo.py index b37838421..151a56f86 100644 --- a/aim/sdk/repo.py +++ b/aim/sdk/repo.py @@ -269,7 +269,9 @@ def get_version(cls, path: str): def is_remote_path(cls, path: str): return path.startswith('aim://') - def _get_container(self, name: str, read_only: bool, from_union: bool = False, skip_read_optimization: bool = False) -> Container: + def _get_container( + self, name: str, read_only: bool, from_union: bool = False, skip_read_optimization: bool = False + ) -> Container: # TODO [AT]: refactor get container/tree logic to make it more simple if self.read_only and not read_only: raise ValueError('Repo is read-only') @@ -317,11 +319,17 @@ def request_tree( read_only: bool, from_union: bool = False, # TODO maybe = True by default no_cache: bool = False, - skip_read_optimization: bool = False + skip_read_optimization: bool = False, ): if 
not self.is_remote_repo: - return self.request(name, sub, read_only=read_only, from_union=from_union, no_cache=no_cache, - skip_read_optimization=skip_read_optimization).tree() + return self.request( + name, + sub, + read_only=read_only, + from_union=from_union, + no_cache=no_cache, + skip_read_optimization=skip_read_optimization, + ).tree() else: return ProxyTree(self._client, name, sub, read_only=read_only, from_union=from_union, no_cache=no_cache) @@ -333,7 +341,7 @@ def request( read_only: bool, from_union: bool = False, # TODO maybe = True by default no_cache: bool = False, - skip_read_optimization: bool = False + skip_read_optimization: bool = False, ): container_config = ContainerConfig(name, sub, read_only) container_view = self.container_view_pool.get(container_config) @@ -344,8 +352,9 @@ def request( else: assert sub is not None path = os.path.join(name, 'chunks', sub) - container = self._get_container(path, read_only=True, from_union=from_union, - skip_read_optimization=skip_read_optimization) + container = self._get_container( + path, read_only=True, from_union=from_union, skip_read_optimization=skip_read_optimization + ) else: assert sub is not None path = os.path.join(name, 'chunks', sub) diff --git a/aim/sdk/reporter/file_manager.py b/aim/sdk/reporter/file_manager.py index 80c2d9a85..72633f084 100644 --- a/aim/sdk/reporter/file_manager.py +++ b/aim/sdk/reporter/file_manager.py @@ -10,10 +10,12 @@ class FileManager(object): @abstractmethod - def poll(self, pattern: str) -> Optional[str]: ... + def poll(self, pattern: str) -> Optional[str]: + ... @abstractmethod - def touch(self, filename: str, cleanup_file_pattern: Optional[str] = None): ... + def touch(self, filename: str, cleanup_file_pattern: Optional[str] = None): + ... class LocalFileManager(FileManager): diff --git a/aim/sdk/run_status_manager.py b/aim/sdk/run_status_manager.py new file mode 100644 index 000000000..71dc42eeb --- /dev/null +++ b/aim/sdk/run_status_manager.py @@ -0,0 +1,95 @@ +import time +import os +import datetime +import pytz +import threading +from pathlib import Path + +from typing import Iterable + +import aimrocks.errors + +from aim import Repo +from aim.sdk.run_status_watcher import Event + + +class RunStatusManager: + INDEXING_GRACE_PERIOD = 10 + + def __init__(self, repo: Repo, scan_interval: int = 60): + self.repo = repo + self.scan_interval = scan_interval + + self.progress_dir = Path(self.repo.path) / 'meta' / 'progress' + self.progress_dir.mkdir(parents=True, exist_ok=True) + + self.heartbeat_dir = Path(self.repo.path) / 'check_ins' + self.run_heartbeat_cache = {} + + self._stop_event = threading.Event() + self._monitor_thread = None + self._corrupted_runs = set() + + def start(self): + if not self._monitor_thread or not self._monitor_thread.is_alive(): + self._stop_event.clear() + self._monitor_thread = threading.Thread(target=self._run_forever, daemon=True) + self._monitor_thread.start() + + def stop(self): + self._stop_event.set() + if self._monitor_thread: + self._monitor_thread.join() + + def _run_forever(self): + while not self._stop_event.is_set(): + self.check_and_terminate_stalled_runs() + time.sleep(self.scan_interval) + + def _runs_with_progress(self) -> Iterable[str]: + runs_with_progress = filter(lambda x: x not in self._corrupted_runs, os.listdir(self.progress_dir)) + run_hashes = sorted(runs_with_progress, key=lambda r: os.path.getmtime(os.path.join(self.progress_dir, r))) + return run_hashes + + def check_and_terminate_stalled_runs(self): + for run_hash in 
self._runs_with_progress(): + if self._is_run_stalled(run_hash): + self._mark_run_as_terminated(run_hash) + + def _is_run_stalled(self, run_hash: str) -> bool: + stalled = False + + heartbeat_files = list(sorted(self.heartbeat_dir.glob(f'{run_hash}-*-progress-*-*'), reverse=True)) + if heartbeat_files: + latest_file = heartbeat_files[0].name + last_heartbeat = Event(latest_file) + + last_recorded_heartbeat = self.run_heartbeat_cache.get(run_hash) + if last_recorded_heartbeat is None: + # First time seeing a heartbeat for this run; store and move on + self.run_heartbeat_cache[run_hash] = last_heartbeat + elif last_heartbeat.idx > last_recorded_heartbeat.idx: + # Newer heartbeat arrived, so the run isn't stalled + self.run_heartbeat_cache[run_hash] = last_heartbeat + else: + # No new heartbeat event since last time; check if enough time passed + time_passed = time.time() - last_recorded_heartbeat.detected_epoch_time + if (last_recorded_heartbeat.next_event_in + RunStatusManager.INDEXING_GRACE_PERIOD) < time_passed: + stalled = True + else: + stalled = True + + return stalled + + def _mark_run_as_terminated(self, run_hash: str): + # TODO [AT]: Add run state handling once decided on terms (finished, terminated, aborted, etc.) + try: + meta_run_tree = self.repo.request_tree('meta', run_hash, read_only=False).subtree( + ('meta', 'chunks', run_hash) + ) + if meta_run_tree.get('end_time') is None: + meta_run_tree['end_time'] = datetime.datetime.now(pytz.utc).timestamp() + progress_path = self.progress_dir / run_hash + progress_path.unlink(missing_ok=True) + except (aimrocks.errors.RocksIOError, aimrocks.errors.Corruption): + self._corrupted_runs.add(run_hash) diff --git a/aim/sdk/run_status_watcher.py b/aim/sdk/run_status_watcher.py index 422cbff12..ccf203bd5 100644 --- a/aim/sdk/run_status_watcher.py +++ b/aim/sdk/run_status_watcher.py @@ -83,13 +83,16 @@ def __init__(self, *, obj_idx: Optional[str] = None, rank: Optional[int] = None, self.message = message @abstractmethod - def is_sent(self): ... + def is_sent(self): + ... @abstractmethod - def update_last_sent(self): ... + def update_last_sent(self): + ... @abstractmethod - def get_msg_details(self): ... + def get_msg_details(self): + ... class StatusNotification(Notification): diff --git a/aim/storage/arrayview.py b/aim/storage/arrayview.py index 4694c1eab..2b9fd8954 100644 --- a/aim/storage/arrayview.py +++ b/aim/storage/arrayview.py @@ -9,7 +9,8 @@ class ArrayView: when index values are not important. """ - def __iter__(self) -> Iterator[Any]: ... + def __iter__(self) -> Iterator[Any]: + ... def keys(self) -> Iterator[int]: """Return sparse indices iterator. @@ -43,13 +44,16 @@ def items(self) -> Iterator[Tuple[int, Any]]: """ ... - def __len__(self) -> int: ... + def __len__(self) -> int: + ... - def __getitem__(self, idx: Union[int, slice]): ... + def __getitem__(self, idx: Union[int, slice]): + ... # TODO implement append - def __setitem__(self, idx: int, val: Any): ... + def __setitem__(self, idx: int, val: Any): + ... def sparse_list(self) -> Tuple[List[int], List[Any]]: """Get sparse indices and values as :obj:`list`s.""" diff --git a/aim/storage/artifacts/artifact_storage.py b/aim/storage/artifacts/artifact_storage.py index efa73cbd1..e0bab8934 100644 --- a/aim/storage/artifacts/artifact_storage.py +++ b/aim/storage/artifacts/artifact_storage.py @@ -7,10 +7,13 @@ def __init__(self, url: str): self.url = url @abstractmethod - def upload_artifact(self, file_path: str, artifact_path: str, block: bool = False): ... 
+ def upload_artifact(self, file_path: str, artifact_path: str, block: bool = False): + ... @abstractmethod - def download_artifact(self, artifact_path: str, dest_dir: Optional[str] = None) -> str: ... + def download_artifact(self, artifact_path: str, dest_dir: Optional[str] = None) -> str: + ... @abstractmethod - def delete_artifact(self, artifact_path: str): ... + def delete_artifact(self, artifact_path: str): + ... diff --git a/aim/storage/inmemorytreeview.py b/aim/storage/inmemorytreeview.py index 7d02c347d..1ce208594 100644 --- a/aim/storage/inmemorytreeview.py +++ b/aim/storage/inmemorytreeview.py @@ -117,6 +117,8 @@ def iterlevel( def array(self, path: Union[AimObjectKey, AimObjectPath] = (), dtype: Any = None) -> TreeArrayView: return TreeArrayView(self.subtree(path), dtype=dtype) - def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ... + def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: + ... - def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ... + def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: + ... diff --git a/aim/storage/query.py b/aim/storage/query.py index 82de23657..f8fa81fbb 100644 --- a/aim/storage/query.py +++ b/aim/storage/query.py @@ -80,7 +80,8 @@ def __init__(self, expr: str): self.expr = expr @abstractmethod - def check(self, **params) -> bool: ... + def check(self, **params) -> bool: + ... def __call__(self, **params): return self.check(**params) diff --git a/aim/storage/rockscontainer.pyx b/aim/storage/rockscontainer.pyx index d21e28c32..e96fc4b42 100644 --- a/aim/storage/rockscontainer.pyx +++ b/aim/storage/rockscontainer.pyx @@ -35,6 +35,7 @@ class RocksAutoClean(AutoClean): super().__init__(instance) self._lock = None self._db = None + self._progress_path = None def _close(self): """ @@ -48,6 +49,9 @@ class RocksAutoClean(AutoClean): self._db = None self._lock.release() self._lock = None + if self._progress_path is not None: + self._progress_path.unlink(missing_ok=True) + self._progress_path = None if self._db is not None: self._db = None @@ -104,6 +108,7 @@ class RocksContainer(Container): if not self.read_only: progress_dir.mkdir(parents=True, exist_ok=True) self._progress_path.touch(exist_ok=True) + self._resources._progress_path = self._progress_path self.db # TODO check if Containers are reopenable @@ -159,16 +164,9 @@ class RocksContainer(Container): Store the collection of `(key, value)` records in the :obj:`Container` `index` for fast reads. """ - if not self._progress_path: - return - for k, v in self.items(): index[k] = v - if self._progress_path.exists(): - self._progress_path.unlink() - self._progress_path = None - def close(self): """Close all the resources.""" if self._resources is None: diff --git a/aim/storage/structured/entities.py b/aim/storage/structured/entities.py index 900c422ec..a43471ea7 100644 --- a/aim/storage/structured/entities.py +++ b/aim/storage/structured/entities.py @@ -13,224 +13,281 @@ class StructuredObject(ABC): @classmethod @abstractmethod - def fields(cls): ... + def fields(cls): + ... class Searchable(ABC, Generic[T]): @classmethod @abstractmethod - def find(cls, _id: str, **kwargs) -> Optional[T]: ... + def find(cls, _id: str, **kwargs) -> Optional[T]: + ... @classmethod @abstractmethod - def all(cls, **kwargs) -> Collection[T]: ... + def all(cls, **kwargs) -> Collection[T]: + ... @classmethod @abstractmethod - def search(cls, term: str, **kwargs) -> Collection[T]: ... 
+ def search(cls, term: str, **kwargs) -> Collection[T]: + ... class Run(StructuredObject, Searchable['Run']): @property @abstractmethod - def hash(self) -> str: ... + def hash(self) -> str: + ... @property @abstractmethod - def name(self) -> Optional[str]: ... + def name(self) -> Optional[str]: + ... @name.setter @abstractmethod - def name(self, value: str): ... + def name(self, value: str): + ... @property @abstractmethod - def description(self) -> Optional[str]: ... + def description(self) -> Optional[str]: + ... @description.setter @abstractmethod - def description(self, value: str): ... + def description(self, value: str): + ... @property @abstractmethod - def archived(self) -> bool: ... + def archived(self) -> bool: + ... @archived.setter @abstractmethod - def archived(self, value: bool): ... + def archived(self, value: bool): + ... @property @abstractmethod - def experiment(self) -> Optional['Experiment']: ... + def experiment(self) -> Optional['Experiment']: + ... @experiment.setter @abstractmethod - def experiment(self, value: str): ... + def experiment(self, value: str): + ... @property @abstractmethod - def tags(self) -> TagCollection: ... + def tags(self) -> TagCollection: + ... @abstractmethod - def add_tag(self, value: str) -> 'Tag': ... + def add_tag(self, value: str) -> 'Tag': + ... @abstractmethod - def remove_tag(self, tag_name: str) -> bool: ... + def remove_tag(self, tag_name: str) -> bool: + ... @property @abstractmethod - def info(self) -> 'RunInfo': ... + def info(self) -> 'RunInfo': + ... class Experiment(StructuredObject, Searchable['Experiment']): @property @abstractmethod - def uuid(self) -> str: ... + def uuid(self) -> str: + ... @property @abstractmethod - def name(self) -> str: ... + def name(self) -> str: + ... @name.setter @abstractmethod - def name(self, value: str): ... + def name(self, value: str): + ... @property @abstractmethod - def description(self) -> Optional[str]: ... + def description(self) -> Optional[str]: + ... @description.setter @abstractmethod - def description(self, value: str): ... + def description(self, value: str): + ... @property @abstractmethod - def archived(self) -> bool: ... + def archived(self) -> bool: + ... @archived.setter @abstractmethod - def archived(self, value: bool): ... + def archived(self, value: bool): + ... @property @abstractmethod - def runs(self) -> RunCollection: ... + def runs(self) -> RunCollection: + ... class Tag(StructuredObject, Searchable['Tag']): @property @abstractmethod - def uuid(self) -> str: ... + def uuid(self) -> str: + ... @property @abstractmethod - def name(self) -> str: ... + def name(self) -> str: + ... @name.setter @abstractmethod - def name(self, value: str): ... + def name(self, value: str): + ... @property @abstractmethod - def color(self) -> str: ... + def color(self) -> str: + ... @color.setter @abstractmethod - def color(self, value: str): ... + def color(self, value: str): + ... @property @abstractmethod - def description(self) -> str: ... + def description(self) -> str: + ... @description.setter @abstractmethod - def description(self, value: str): ... + def description(self, value: str): + ... @property @abstractmethod - def archived(self) -> bool: ... + def archived(self) -> bool: + ... @archived.setter @abstractmethod - def archived(self, value: bool): ... + def archived(self, value: bool): + ... @property @abstractmethod - def runs(self) -> RunCollection: ... + def runs(self) -> RunCollection: + ... 
class Note(StructuredObject, Searchable['Note']): @property @abstractmethod - def id(self) -> int: ... + def id(self) -> int: + ... @property @abstractmethod - def content(self) -> str: ... + def content(self) -> str: + ... @content.setter @abstractmethod - def content(self, value: str): ... + def content(self, value: str): + ... @property @abstractmethod - def run(self) -> int: ... + def run(self) -> int: + ... class RunInfo(StructuredObject, Generic[T]): @property @abstractmethod - def last_notification_index(self) -> int: ... + def last_notification_index(self) -> int: + ... @last_notification_index.setter @abstractmethod - def last_notification_index(self, value: int): ... + def last_notification_index(self, value: int): + ... class ObjectFactory: @abstractmethod - def runs(self) -> RunCollection: ... + def runs(self) -> RunCollection: + ... @abstractmethod - def search_runs(self, term: str) -> RunCollection: ... + def search_runs(self, term: str) -> RunCollection: + ... @abstractmethod - def find_run(self, _id: str) -> Run: ... + def find_run(self, _id: str) -> Run: + ... @abstractmethod - def find_runs(self, ids: List[str]) -> List[Run]: ... + def find_runs(self, ids: List[str]) -> List[Run]: + ... @abstractmethod - def create_run(self, runhash: str) -> Run: ... + def create_run(self, runhash: str) -> Run: + ... @abstractmethod - def delete_run(self, runhash: str) -> bool: ... + def delete_run(self, runhash: str) -> bool: + ... @abstractmethod - def experiments(self) -> ExperimentCollection: ... + def experiments(self) -> ExperimentCollection: + ... @abstractmethod - def search_experiments(self, term: str) -> ExperimentCollection: ... + def search_experiments(self, term: str) -> ExperimentCollection: + ... @abstractmethod - def find_experiment(self, _id: str) -> Experiment: ... + def find_experiment(self, _id: str) -> Experiment: + ... @abstractmethod - def create_experiment(self, name: str) -> Experiment: ... + def create_experiment(self, name: str) -> Experiment: + ... @abstractmethod - def delete_experiment(self, _id: str) -> bool: ... + def delete_experiment(self, _id: str) -> bool: + ... @abstractmethod - def tags(self) -> TagCollection: ... + def tags(self) -> TagCollection: + ... @abstractmethod - def search_tags(self, term: str) -> TagCollection: ... + def search_tags(self, term: str) -> TagCollection: + ... @abstractmethod - def find_tag(self, _id: str) -> Tag: ... + def find_tag(self, _id: str) -> Tag: + ... @abstractmethod - def create_tag(self, name: str) -> Tag: ... + def create_tag(self, name: str) -> Tag: + ... @abstractmethod - def delete_tag(self, name: str) -> bool: ... + def delete_tag(self, name: str) -> bool: + ... diff --git a/aim/storage/treeview.py b/aim/storage/treeview.py index fc05a06f6..f80beff50 100644 --- a/aim/storage/treeview.py +++ b/aim/storage/treeview.py @@ -8,21 +8,26 @@ class TreeView: - def preload(self): ... + def preload(self): + ... - def finalize(self, index: 'TreeView'): ... + def finalize(self, index: 'TreeView'): + ... def subtree(self, path: Union[AimObjectKey, AimObjectPath]) -> 'TreeView': # Default to: return self.view(path, resolve=False) - def view(self, path: Union[AimObjectKey, AimObjectPath], resolve: bool = False): ... + def view(self, path: Union[AimObjectKey, AimObjectPath], resolve: bool = False): + ... - def make_array(self, path: Union[AimObjectKey, AimObjectPath] = ()): ... + def make_array(self, path: Union[AimObjectKey, AimObjectPath] = ()): + ... 
def collect( self, path: Union[AimObjectKey, AimObjectPath] = (), strict: bool = True, resolve_objects: bool = False - ) -> AimObject: ... + ) -> AimObject: + ... def __getitem__(self, path: Union[AimObjectKey, AimObjectPath]) -> AimObject: return self.collect(path) @@ -33,7 +38,8 @@ def get(self, path: Union[AimObjectKey, AimObjectPath] = (), default: Any = None except KeyError: return default - def __delitem__(self, path: Union[AimObjectKey, AimObjectPath]): ... + def __delitem__(self, path: Union[AimObjectKey, AimObjectPath]): + ... def set(self, path: Union[AimObjectKey, AimObjectPath], value: AimObject, strict: bool = True): self.__setitem__(path, value) @@ -45,18 +51,25 @@ def __setitem__(self, path: Union[AimObjectKey, AimObjectPath], value: AimObject def keys_eager( self, path: Union[AimObjectKey, AimObjectPath] = (), - ): ... + ): + ... def keys( self, path: Union[AimObjectKey, AimObjectPath] = (), level: int = None - ) -> Iterator[Union[AimObjectPath, AimObjectKey]]: ... + ) -> Iterator[Union[AimObjectPath, AimObjectKey]]: + ... - def items_eager(self, path: Union[AimObjectKey, AimObjectPath] = ()): ... + def items_eager(self, path: Union[AimObjectKey, AimObjectPath] = ()): + ... - def items(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> Iterator[Tuple[AimObjectKey, AimObject]]: ... + def items(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> Iterator[Tuple[AimObjectKey, AimObject]]: + ... - def array(self, path: Union[AimObjectKey, AimObjectPath] = (), dtype: Any = None) -> 'ArrayView': ... + def array(self, path: Union[AimObjectKey, AimObjectPath] = (), dtype: Any = None) -> 'ArrayView': + ... - def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ... + def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: + ... - def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ... + def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: + ... 
From 9ee40a256d40d9aa2361529444f525a9b6b33a8a Mon Sep 17 00:00:00 2001 From: Larissa Poghosyan <43134338+larissapoghosyan@users.noreply.github.com> Date: Tue, 8 Apr 2025 15:28:52 +0100 Subject: [PATCH 26/30] [fix] Aim web ui integration in jupyter/colab (#3319) * api endpoint /status is not implemented, but we can rely on status code for /projects * implement retrying with exponential backoff --- aim/cli/manager/manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/aim/cli/manager/manager.py b/aim/cli/manager/manager.py index 99922e708..52385b9d4 100644 --- a/aim/cli/manager/manager.py +++ b/aim/cli/manager/manager.py @@ -33,18 +33,18 @@ def check_startup_success(): import requests server_path = 'http://{}:{}{}'.format(args['--host'], args['--port'], args['--base-path']) - status_api = f'{server_path}/api/projects/status' - retry_count = 5 - sleep_interval = 1 + status_api = f'{server_path}/api/projects/' + retry_count = 10 + sleep_interval = 0.1 for _ in range(retry_count): + time.sleep(sleep_interval) + sleep_interval *= 2 try: response = requests.get(status_api) if response.status_code == 200: return True except Exception: pass - sleep_interval += 1 - time.sleep(sleep_interval) return False From 6a559f382bbbb63da6ee69e210d514492fcae087 Mon Sep 17 00:00:00 2001 From: Albert Torosyan <32957250+alberttorosyan@users.noreply.github.com> Date: Wed, 30 Apr 2025 16:40:44 +0400 Subject: [PATCH 27/30] [fix] Fallback to union db if index is missing (#3317) Co-authored-by: mihran113 --- aim/cli/runs/commands.py | 4 +- aim/cli/storage/commands.py | 4 +- aim/cli/up/commands.py | 4 +- aim/ext/transport/handlers.py | 4 +- aim/sdk/base_run.py | 5 +- aim/sdk/index_manager.py | 140 +++++++++++++----- aim/sdk/repo.py | 146 +++++++----------- aim/sdk/reporter/file_manager.py | 6 +- aim/sdk/run.py | 14 +- aim/sdk/run_status_manager.py | 8 +- aim/sdk/run_status_watcher.py | 9 +- aim/sdk/sequence.py | 2 +- aim/sdk/uri_service.py | 10 +- aim/sdk/utils.py | 5 +- aim/storage/arrayview.py | 12 +- aim/storage/artifacts/artifact_storage.py | 9 +- aim/storage/inmemorytreeview.py | 6 +- aim/storage/query.py | 3 +- aim/storage/structured/entities.py | 171 ++++++++-------------- aim/storage/treeview.py | 39 ++--- aim/storage/treeviewproxy.py | 4 - aim/storage/union.pyx | 5 +- aim/web/api/projects/project.py | 1 - 23 files changed, 265 insertions(+), 346 deletions(-) diff --git a/aim/cli/runs/commands.py b/aim/cli/runs/commands.py index 26a4b58d6..1696b3209 100644 --- a/aim/cli/runs/commands.py +++ b/aim/cli/runs/commands.py @@ -192,10 +192,10 @@ def update_metrics(ctx, yes): if not confirmed: return - index_manager = RepoIndexManager.get_index_manager(repo, disable_monitoring=True) + index_manager = RepoIndexManager.get_index_manager(repo) hashes = repo.list_all_runs() for run_hash in tqdm.tqdm(hashes, desc='Updating runs', total=len(hashes)): - meta_tree = repo.request_tree('meta', run_hash, read_only=False, from_union=False) + meta_tree = repo.request_tree('meta', run_hash, read_only=False) meta_run_tree = meta_tree.subtree(('meta', 'chunks', run_hash)) try: # check if the Run has already been updated. 
diff --git a/aim/cli/storage/commands.py b/aim/cli/storage/commands.py index 32bfe01d5..3210f7a69 100644 --- a/aim/cli/storage/commands.py +++ b/aim/cli/storage/commands.py @@ -51,7 +51,7 @@ def to_3_11(ctx, hashes, yes): if not confirmed: return - index_manager = RepoIndexManager.get_index_manager(repo, disable_monitoring=True) + index_manager = RepoIndexManager.get_index_manager(repo) for run_hash in tqdm(matched_hashes): try: run = Run(run_hash, repo=repo) @@ -97,7 +97,7 @@ def restore_runs(ctx, hashes, yes): return remaining_runs = [] - index_manager = RepoIndexManager.get_index_manager(repo, disable_monitoring=True) + index_manager = RepoIndexManager.get_index_manager(repo) for run_hash in tqdm(matched_hashes): try: restore_run_backup(repo, run_hash) diff --git a/aim/cli/up/commands.py b/aim/cli/up/commands.py index 4775e0aa8..6ad5e6e73 100644 --- a/aim/cli/up/commands.py +++ b/aim/cli/up/commands.py @@ -124,7 +124,9 @@ def up( if profiler: os.environ[AIM_PROFILER_KEY] = '1' - RepoIndexManager.get_index_manager(repo_inst) + index_mng = RepoIndexManager.get_index_manager(repo_inst) + index_mng.start() + run_status_mng = RunStatusManager(repo_inst) run_status_mng.start() try: diff --git a/aim/ext/transport/handlers.py b/aim/ext/transport/handlers.py index 7915bc105..23c9985d2 100644 --- a/aim/ext/transport/handlers.py +++ b/aim/ext/transport/handlers.py @@ -51,14 +51,12 @@ def get_tree(**kwargs): name = kwargs['name'] sub = kwargs['sub'] read_only = kwargs['read_only'] - from_union = kwargs['from_union'] index = kwargs['index'] timeout = kwargs['timeout'] - no_cache = kwargs.get('no_cache', False) if index: return ResourceRef(repo._get_index_tree(name, timeout)) else: - return ResourceRef(repo.request_tree(name, sub, read_only=read_only, from_union=from_union, no_cache=no_cache)) + return ResourceRef(repo.request_tree(name, sub, read_only=read_only)) def get_structured_run(hash_, read_only, created_at, **kwargs): diff --git a/aim/sdk/base_run.py b/aim/sdk/base_run.py index 89edf63b0..f77c435d8 100644 --- a/aim/sdk/base_run.py +++ b/aim/sdk/base_run.py @@ -39,6 +39,7 @@ def __init__( if self.read_only: assert run_hash is not None self.hash = run_hash + self.meta_tree: TreeView = self.repo.request_tree('meta', read_only=True).subtree('meta') else: if run_hash is None: self.hash = generate_run_hash() @@ -48,10 +49,8 @@ def __init__( raise MissingRunError(f'Cannot find Run {run_hash} in aim Repo {self.repo.path}.') self._lock = self.repo.request_run_lock(self.hash) self._lock.lock(force=force_resume) + self.meta_tree: TreeView = self.repo.request_tree('meta', self.hash, read_only=False).subtree('meta') - self.meta_tree: TreeView = self.repo.request_tree( - 'meta', self.hash, read_only=read_only, from_union=True - ).subtree('meta') self.meta_run_tree: TreeView = self.meta_tree.subtree('chunks').subtree(self.hash) self._series_run_trees: Dict[int, TreeView] = None diff --git a/aim/sdk/index_manager.py b/aim/sdk/index_manager.py index ff8eaa723..166e6ae0e 100644 --- a/aim/sdk/index_manager.py +++ b/aim/sdk/index_manager.py @@ -1,15 +1,19 @@ +import hashlib import logging import os import queue import threading +import time from pathlib import Path +from typing import Dict import aimrocks.errors from aim.sdk.repo import Repo from watchdog.events import FileSystemEventHandler from watchdog.observers import Observer +from watchdog.observers.api import ObservedWatch from watchdog.observers.polling import PollingObserver @@ -19,12 +23,17 @@ class NewChunkCreatedHandler(FileSystemEventHandler): def 
__init__(self, manager): self.manager = manager + self.known_chunks = set(p.name for p in self.manager.chunks_dir.iterdir() if p.is_dir()) - def on_created(self, event): - if event.is_directory and Path(event.src_path).parent == self.manager.chunks_dir: - chunk_name = os.path.basename(event.src_path) - logger.debug(f'Detected new chunk directory: {chunk_name}') - self.manager.monitor_chunk_directory(event.src_path) + def on_modified(self, event): + if event.is_directory and Path(event.src_path) == self.manager.chunks_dir: + current_chunks = set(p.name for p in self.manager.chunks_dir.iterdir() if p.is_dir()) + new_chunks = current_chunks - self.known_chunks + for chunk_name in new_chunks: + chunk_path = self.manager.chunks_dir / chunk_name + logger.debug(f'Detected new chunk directory: {chunk_name}') + self.manager.monitor_chunk_directory(chunk_path) + self.known_chunks = current_chunks class ChunkChangedHandler(FileSystemEventHandler): @@ -71,14 +80,14 @@ class RepoIndexManager: index_manager_pool = {} @classmethod - def get_index_manager(cls, repo: Repo, disable_monitoring: bool = False): + def get_index_manager(cls, repo: Repo): mng = cls.index_manager_pool.get(repo.path, None) if mng is None: - mng = RepoIndexManager(repo, disable_monitoring) + mng = RepoIndexManager(repo) cls.index_manager_pool[repo.path] = mng return mng - def __init__(self, repo: Repo, disable_monitoring: bool): + def __init__(self, repo: Repo): self.repo_path = repo.path self.repo = repo self.chunks_dir = Path(self.repo_path) / 'meta' / 'chunks' @@ -86,35 +95,71 @@ def __init__(self, repo: Repo, disable_monitoring: bool): self._corrupted_runs = set() - if not disable_monitoring: - self.indexing_queue = queue.PriorityQueue() - self.lock = threading.Lock() + self.indexing_queue = queue.PriorityQueue() + self.lock = threading.Lock() + + self.new_chunk_observer = Observer() + self.chunk_change_observer = PollingObserver() + + self.new_chunk_handler = NewChunkCreatedHandler(self) + self.chunk_change_handler = ChunkChangedHandler(self) + self._watches: Dict[str, ObservedWatch] = dict() + self.new_chunk_observer.schedule(self.new_chunk_handler, self.chunks_dir, recursive=False) - self.new_chunk_observer = Observer() - self.chunk_change_observer = PollingObserver() + self._stop_event = threading.Event() + self._index_thread = None + self._monitor_thread = None - self.new_chunk_handler = NewChunkCreatedHandler(self) - self.chunk_change_handler = ChunkChangedHandler(self) + def start(self): + self._stop_event.clear() + self.new_chunk_observer.start() + self.chunk_change_observer.start() - self.new_chunk_observer.schedule(self.new_chunk_handler, self.chunks_dir, recursive=True) - self.new_chunk_observer.start() + if not self._index_thread or not self._index_thread.is_alive(): + self._index_thread = threading.Thread(target=self._process_indexing_queue, daemon=True) + self._index_thread.start() - self._monitor_existing_chunks() - self.chunk_change_observer.start() + if not self._monitor_thread or not self._monitor_thread.is_alive(): + self._monitor_thread = threading.Thread(target=self._monitor_existing_chunks, daemon=True) + self._monitor_thread.start() - self._reindex_thread = threading.Thread(target=self._process_queue, daemon=True) - self._reindex_thread.start() + def stop(self): + self._stop_event.set() + self.new_chunk_observer.stop() + self.chunk_change_observer.stop() + if self._monitor_thread: + self._monitor_thread.join() + if self._index_thread: + self._index_thread.join() def _monitor_existing_chunks(self): - for 
chunk_path in self.chunks_dir.iterdir(): - if chunk_path.is_dir(): - logger.debug(f'Monitoring existing chunk: {chunk_path}') - self.monitor_chunk_directory(chunk_path) + while not self._stop_event.is_set(): + index_db = self.repo.request_tree('meta', read_only=True) + monitored_chunks = set(self._watches.keys()) + for chunk_path in self.chunks_dir.iterdir(): + if ( + chunk_path.is_dir() + and chunk_path.name not in monitored_chunks + and self._is_run_index_outdated(chunk_path.name, index_db) + ): + logger.debug(f'Monitoring existing chunk: {chunk_path}') + self.monitor_chunk_directory(chunk_path) + logger.debug(f'Triggering indexing for run {chunk_path.name}') + self.add_run_to_queue(chunk_path.name) + self.repo.container_pool.clear() + time.sleep(5) + + def _stop_monitoring_chunk(self, run_hash): + watch = self._watches.pop(run_hash, None) + if watch: + self.chunk_change_observer.unschedule(watch) + logger.debug(f'Stopped monitoring chunk: {run_hash}') def monitor_chunk_directory(self, chunk_path): """Ensure chunk directory is monitored using a single handler.""" - if str(chunk_path) not in self.chunk_change_observer._watches: - self.chunk_change_observer.schedule(self.chunk_change_handler, chunk_path, recursive=True) + if chunk_path.name not in self._watches: + watch = self.chunk_change_observer.schedule(self.chunk_change_handler, chunk_path, recursive=True) + self._watches[chunk_path.name] = watch logger.debug(f'Started monitoring chunk directory: {chunk_path}') else: logger.debug(f'Chunk directory already monitored: {chunk_path}') @@ -127,8 +172,8 @@ def add_run_to_queue(self, run_hash): self.indexing_queue.put((timestamp, run_hash)) logger.debug(f'Run {run_hash} added to indexing queue with timestamp {timestamp}') - def _process_queue(self): - while True: + def _process_indexing_queue(self): + while not self._stop_event.is_set(): _, run_hash = self.indexing_queue.get() logger.debug(f'Indexing run {run_hash}...') self.index(run_hash) @@ -137,12 +182,41 @@ def _process_queue(self): def index(self, run_hash): index = self.repo._get_index_tree('meta', 0).view(()) try: - meta_tree = self.repo.request_tree( - 'meta', run_hash, read_only=True, from_union=False, no_cache=True, skip_read_optimization=True - ).subtree('meta') + run_checksum = self._get_run_checksum(run_hash) + meta_tree = self.repo.request_tree('meta', run_hash, read_only=True, skip_read_optimization=True).subtree( + 'meta' + ) meta_run_tree = meta_tree.subtree('chunks').subtree(run_hash) meta_run_tree.finalize(index=index) + index['index_cache', run_hash] = run_checksum + + if meta_run_tree.get('end_time') is not None: + logger.debug(f'Indexing thread detected finished run: {run_hash}. Stopping monitoring...') + self._stop_monitoring_chunk(run_hash) + except (aimrocks.errors.RocksIOError, aimrocks.errors.Corruption): - logger.warning(f"Indexing thread detected corrupted run '{run_hash}'. Skipping.") + logger.warning(f'Indexing thread detected corrupted run: {run_hash}. 
Skipping.') self._corrupted_runs.add(run_hash) return True + + def _is_run_index_outdated(self, run_hash, index_db): + return self._get_run_checksum(run_hash) != index_db.get(('index_cache', run_hash)) + + def _get_run_checksum(self, run_hash): + hash_obj = hashlib.md5() + + for root, dirs, files in os.walk(os.path.join(self.chunks_dir, run_hash)): + for name in sorted(files): # sort to ensure consistent order + if name.startswith('LOG'): # skip access logs + continue + filepath = os.path.join(root, name) + try: + stat = os.stat(filepath) + hash_obj.update(filepath.encode('utf-8')) + hash_obj.update(str(stat.st_mtime).encode('utf-8')) + hash_obj.update(str(stat.st_size).encode('utf-8')) + except FileNotFoundError: + # File might have been deleted between os.walk and os.stat + continue + + return hash_obj.hexdigest() diff --git a/aim/sdk/repo.py b/aim/sdk/repo.py index 151a56f86..1ffef1c9b 100644 --- a/aim/sdk/repo.py +++ b/aim/sdk/repo.py @@ -8,6 +8,8 @@ from typing import TYPE_CHECKING, Dict, Iterator, List, NamedTuple, Optional, Set, Tuple from weakref import WeakValueDictionary +import aimrocks.errors + from aim.ext.cleanup import AutoClean from aim.ext.sshfs.utils import mount_remote_repo, unmount_remote_repo from aim.ext.task_queue.queue import TaskQueue @@ -127,9 +129,14 @@ def __init__(self, path: str, *, read_only: Optional[bool] = None, init: Optiona self.root_path = path self.path = os.path.join(self.root_path, get_aim_repo_name()) - if init: + if init and not self.is_remote_repo: os.makedirs(self.path, exist_ok=True) os.makedirs(os.path.join(self.path, 'locks'), exist_ok=True) + + # Make sure meta index db is created + path = os.path.join(self.path, 'meta', 'index') + RocksContainer(path, read_only=False) + if not self.is_remote_repo and not os.path.exists(self.path): if self._mount_root: unmount_remote_repo(self.root_path, self._mount_root) @@ -137,7 +144,6 @@ def __init__(self, path: str, *, read_only: Optional[bool] = None, init: Optiona self.container_pool: Dict[ContainerConfig, Container] = WeakValueDictionary() self.persistent_pool: Dict[ContainerConfig, Container] = dict() - self.container_view_pool: Dict[ContainerConfig, Container] = WeakValueDictionary() self._run_props_cache_hint = None self._encryption_key = None @@ -160,7 +166,7 @@ def __init__(self, path: str, *, read_only: Optional[bool] = None, init: Optiona @property def meta_tree(self): - return self.request_tree('meta', read_only=True, from_union=True).subtree('meta') + return self.request_tree('meta', read_only=True).subtree('meta') def __repr__(self) -> str: return f'' @@ -269,28 +275,6 @@ def get_version(cls, path: str): def is_remote_path(cls, path: str): return path.startswith('aim://') - def _get_container( - self, name: str, read_only: bool, from_union: bool = False, skip_read_optimization: bool = False - ) -> Container: - # TODO [AT]: refactor get container/tree logic to make it more simple - if self.read_only and not read_only: - raise ValueError('Repo is read-only') - - container_config = ContainerConfig(name, None, read_only=read_only) - container = self.container_pool.get(container_config) - if container is None: - if from_union: - # Temporarily use index db when getting data from union. 
- path = os.path.join(self.path, name, 'index') - container = RocksContainer(path, read_only=read_only, skip_read_optimization=skip_read_optimization) - self.persistent_pool[container_config] = container - else: - path = os.path.join(self.path, name) - container = RocksContainer(path, read_only=read_only, skip_read_optimization=skip_read_optimization) - self.container_pool[container_config] = container - - return container - def _get_index_tree(self, name: str, timeout: int): if not self.is_remote_repo: return self._get_index_container(name, timeout).tree() @@ -311,60 +295,30 @@ def _get_index_container(self, name: str, timeout: int) -> Container: return container - def request_tree( - self, - name: str, - sub: str = None, - *, - read_only: bool, - from_union: bool = False, # TODO maybe = True by default - no_cache: bool = False, - skip_read_optimization: bool = False, - ): + def request_tree(self, name: str, sub: str = None, *, read_only: bool, skip_read_optimization: bool = False): if not self.is_remote_repo: - return self.request( - name, - sub, - read_only=read_only, - from_union=from_union, - no_cache=no_cache, - skip_read_optimization=skip_read_optimization, + return self.request_container( + name, sub, read_only=read_only, skip_read_optimization=skip_read_optimization ).tree() else: - return ProxyTree(self._client, name, sub, read_only=read_only, from_union=from_union, no_cache=no_cache) + return ProxyTree(self._client, name, sub, read_only=read_only) - def request( - self, - name: str, - sub: str = None, - *, - read_only: bool, - from_union: bool = False, # TODO maybe = True by default - no_cache: bool = False, - skip_read_optimization: bool = False, - ): + def request_container(self, name: str, sub: str = None, *, read_only: bool, skip_read_optimization: bool = False): container_config = ContainerConfig(name, sub, read_only) - container_view = self.container_view_pool.get(container_config) - if container_view is None or no_cache: - if read_only: - if from_union: - path = name - else: - assert sub is not None - path = os.path.join(name, 'chunks', sub) - container = self._get_container( - path, read_only=True, from_union=from_union, skip_read_optimization=skip_read_optimization - ) + container = self.container_pool.get(container_config) + if container is None: + if sub is None: + try: + path = os.path.join(self.path, name, 'index') + container = RocksContainer(path, read_only=True, skip_read_optimization=skip_read_optimization) + except aimrocks.errors.RocksIOError: + path = os.path.join(self.path, name) + container = RocksUnionContainer(path, read_only=True) else: - assert sub is not None - path = os.path.join(name, 'chunks', sub) - container = self._get_container(path, read_only=False, from_union=False) - - container_view = container - if not no_cache: - self.container_view_pool[container_config] = container_view - - return container_view + path = os.path.join(self.path, name, 'chunks', sub) + container = RocksContainer(path, read_only=read_only, skip_read_optimization=skip_read_optimization) + self.container_pool[container_config] = container + return container def request_props(self, hash_: str, read_only: bool, created_at: 'datetime' = None): if self.is_remote_repo: @@ -755,9 +709,6 @@ def encryption_key(self): return encryption_key - def _get_meta_tree(self): - return self.request_tree('meta', read_only=True, from_union=True).subtree('meta') - @staticmethod def available_sequence_types(): return Sequence.registry.keys() @@ -779,7 +730,6 @@ def collect_sequence_info(self, 
sequence_types: Tuple[str, ...]) -> Dict[str, Di Returns: :obj:`dict`: Tree of sequences and their contexts groupped by sequence type. """ - meta_tree = self._get_meta_tree() sequence_traces = {} if isinstance(sequence_types, str): sequence_types = (sequence_types,) @@ -792,7 +742,7 @@ def collect_sequence_info(self, sequence_types: Tuple[str, ...]) -> Dict[str, Di dtype_traces = set() for dtype in dtypes: try: - dtype_trace_tree = meta_tree.collect(('traces_types', dtype)) + dtype_trace_tree = self.meta_tree.collect(('traces_types', dtype)) for ctx_id, seqs in dtype_trace_tree.items(): for seq_name in seqs.keys(): dtype_traces.add((ctx_id, seq_name)) @@ -800,7 +750,7 @@ def collect_sequence_info(self, sequence_types: Tuple[str, ...]) -> Dict[str, Di pass if 'float' in dtypes: # old sequences without dtype set are considered float sequences try: - dtype_trace_tree = meta_tree.collect('traces') + dtype_trace_tree = self.meta_tree.collect('traces') for ctx_id, seqs in dtype_trace_tree.items(): for seq_name in seqs.keys(): dtype_traces.add((ctx_id, seq_name)) @@ -808,7 +758,7 @@ def collect_sequence_info(self, sequence_types: Tuple[str, ...]) -> Dict[str, Di pass traces_info = defaultdict(list) for ctx_id, seq_name in dtype_traces: - traces_info[seq_name].append(meta_tree['contexts', ctx_id]) + traces_info[seq_name].append(self.meta_tree['contexts', ctx_id]) sequence_traces[seq_type] = traces_info return sequence_traces @@ -818,9 +768,8 @@ def collect_params_info(self) -> dict: Returns: :obj:`dict`: All runs meta-parameters. """ - meta_tree = self._get_meta_tree() try: - return meta_tree.collect('attrs', strict=False) + return self.meta_tree.collect('attrs', strict=False) except KeyError: return {} @@ -891,22 +840,13 @@ def _delete_run(self, run_hash): def _copy_run(self, run_hash, dest_repo): def copy_trees(): # copy run meta tree - source_meta_tree = self.request_tree( - 'meta', run_hash, read_only=True, from_union=False, no_cache=True - ).subtree('meta') - dest_meta_tree = dest_repo.request_tree( - 'meta', run_hash, read_only=False, from_union=False, no_cache=True - ).subtree('meta') - dest_meta_run_tree = dest_meta_tree.subtree('chunks').subtree(run_hash) + source_meta_tree = self.request_tree('meta', run_hash, read_only=True).subtree('meta') + dest_meta_tree = dest_repo.request_tree('meta', run_hash, read_only=False).subtree('meta') dest_meta_tree[...] = source_meta_tree[...] 
- dest_index = dest_repo._get_index_tree('meta', timeout=10).view(()) - dest_meta_run_tree.finalize(index=dest_index) # copy run series tree - source_series_run_tree = self.request_tree('seqs', run_hash, read_only=True, no_cache=True).subtree('seqs') - dest_series_run_tree = dest_repo.request_tree('seqs', run_hash, read_only=False, no_cache=True).subtree( - 'seqs' - ) + source_series_run_tree = self.request_tree('seqs', run_hash, read_only=True).subtree('seqs') + dest_series_run_tree = dest_repo.request_tree('seqs', run_hash, read_only=False).subtree('seqs') # copy v2 sequences source_v2_tree = source_series_run_tree.subtree(('v2', 'chunks', run_hash)) @@ -1014,6 +954,10 @@ def _restore_run(self, run_hash): restore_run_backup(self, run_hash) def _close_run(self, run_hash): + import datetime + + import pytz + def optimize_container(path, extra_options): rc = RocksContainer(path, read_only=True, **extra_options) rc.optimize_for_read() @@ -1024,6 +968,16 @@ def optimize_container(path, extra_options): lock_manager = LockManager(self.path) if lock_manager.release_locks(run_hash, force=True): + # Set run end time if locks are removed + meta_tree = self.request_tree( + 'meta', + run_hash, + read_only=False, + ).subtree('meta') + meta_run_tree = meta_tree.subtree('chunks').subtree(run_hash) + if not meta_run_tree.get('end_time'): + meta_run_tree['end_time'] = datetime.datetime.now(pytz.utc).timestamp() + # Run rocksdb optimizations if container locks are removed meta_db_path = os.path.join(self.path, 'meta', 'chunks', run_hash) seqs_db_path = os.path.join(self.path, 'seqs', 'chunks', run_hash) @@ -1039,7 +993,7 @@ def _recreate_index(self): from aim.sdk.index_manager import RepoIndexManager - index_manager = RepoIndexManager.get_index_manager(self, disable_monitoring=True) + index_manager = RepoIndexManager.get_index_manager(self) # force delete the index db and the locks diff --git a/aim/sdk/reporter/file_manager.py b/aim/sdk/reporter/file_manager.py index 72633f084..80c2d9a85 100644 --- a/aim/sdk/reporter/file_manager.py +++ b/aim/sdk/reporter/file_manager.py @@ -10,12 +10,10 @@ class FileManager(object): @abstractmethod - def poll(self, pattern: str) -> Optional[str]: - ... + def poll(self, pattern: str) -> Optional[str]: ... @abstractmethod - def touch(self, filename: str, cleanup_file_pattern: Optional[str] = None): - ... + def touch(self, filename: str, cleanup_file_pattern: Optional[str] = None): ... class LocalFileManager(FileManager): diff --git a/aim/sdk/run.py b/aim/sdk/run.py index 775aed973..b53bdf72a 100644 --- a/aim/sdk/run.py +++ b/aim/sdk/run.py @@ -82,9 +82,9 @@ def __init__(self, instance: 'Run') -> None: def add_extra_resource(self, resource) -> None: self.extra_resources.append(resource) - def finalize_run(self): + def set_run_end_time(self): """ - Finalize the run by indexing all the data. + Set Run end_time to mark it as finished. """ self.meta_run_tree['end_time'] = datetime.datetime.now(pytz.utc).timestamp() @@ -94,7 +94,7 @@ def empty_rpc_queue(self): def _close(self) -> None: """ - Close the `Run` instance resources and trigger indexing. + Close the `Run` instance resources. 
""" if self.read_only: logger.debug(f'Run {self.hash} is read-only, skipping cleanup') @@ -104,7 +104,7 @@ def _close(self) -> None: res.close() self.empty_rpc_queue() - self.finalize_run() + self.set_run_end_time() if self._heartbeat is not None: self._heartbeat.stop() if self._checkins is not None: @@ -725,12 +725,6 @@ def close(self): self._props = None self._cleanup_trees() - def finalize(self): - if self._resources is None: - return - - self._resources.finalize_run() - def dataframe( self, include_props: bool = True, diff --git a/aim/sdk/run_status_manager.py b/aim/sdk/run_status_manager.py index 71dc42eeb..e1fa6f3fc 100644 --- a/aim/sdk/run_status_manager.py +++ b/aim/sdk/run_status_manager.py @@ -1,13 +1,13 @@ -import time -import os import datetime -import pytz +import os import threading -from pathlib import Path +import time +from pathlib import Path from typing import Iterable import aimrocks.errors +import pytz from aim import Repo from aim.sdk.run_status_watcher import Event diff --git a/aim/sdk/run_status_watcher.py b/aim/sdk/run_status_watcher.py index ccf203bd5..422cbff12 100644 --- a/aim/sdk/run_status_watcher.py +++ b/aim/sdk/run_status_watcher.py @@ -83,16 +83,13 @@ def __init__(self, *, obj_idx: Optional[str] = None, rank: Optional[int] = None, self.message = message @abstractmethod - def is_sent(self): - ... + def is_sent(self): ... @abstractmethod - def update_last_sent(self): - ... + def update_last_sent(self): ... @abstractmethod - def get_msg_details(self): - ... + def get_msg_details(self): ... class StatusNotification(Notification): diff --git a/aim/sdk/sequence.py b/aim/sdk/sequence.py index de8c78e1d..dde9e215f 100644 --- a/aim/sdk/sequence.py +++ b/aim/sdk/sequence.py @@ -201,7 +201,7 @@ def numpy(self) -> Tuple[np.ndarray, List[np.ndarray]]: sort_indices = steps.argsort() columns = [arr[sort_indices] for arr in columns] steps = steps[sort_indices] - if last_step is not None and last_step != steps[-1]: + if last_step is not None and last_step > steps[-1]: step_hash = self.step_hash(last_step) # The `last_step` is provided by the meta tree which may potentially # be out of sync with the series tree. 
diff --git a/aim/sdk/uri_service.py b/aim/sdk/uri_service.py index 10d588918..062c05ac6 100644 --- a/aim/sdk/uri_service.py +++ b/aim/sdk/uri_service.py @@ -55,7 +55,7 @@ def request_batch(self, uri_batch: List[str]) -> Iterator[Dict[str, bytes]]: for uri, sub_name, resource_path in self.runs_pool[run_name]: container = run_containers.get(sub_name) if not container: - container = self._get_container(run_name, sub_name) + container = self.repo.request_container(sub_name, run_name, read_only=True) run_containers[sub_name] = container resource_path = decode_path(bytes.fromhex(resource_path)) @@ -70,11 +70,3 @@ def request_batch(self, uri_batch: List[str]) -> Iterator[Dict[str, bytes]]: # clear runs pool self.runs_pool.clear() - - def _get_container(self, run_name: str, sub_name: str): - if sub_name == 'meta': - container = self.repo.request(sub_name, run_name, from_union=True, read_only=True) - else: - container = self.repo.request(sub_name, run_name, read_only=True) - - return container diff --git a/aim/sdk/utils.py b/aim/sdk/utils.py index 6863600f9..0e5ff84fa 100644 --- a/aim/sdk/utils.py +++ b/aim/sdk/utils.py @@ -165,13 +165,12 @@ def flatten(d, parent_path=None): return all_paths subtrees_to_lookup = ('attrs', 'traces_types', 'contexts', 'traces') - repo_meta_tree = repo._get_meta_tree() # set of all repo paths that can be left dangling after run deletion repo_paths = set() for key in subtrees_to_lookup: try: - repo_paths.update(flatten(repo_meta_tree.collect(key, strict=False), parent_path=(key,))) + repo_paths.update(flatten(repo.meta_tree.collect(key, strict=False), parent_path=(key,))) except KeyError: pass @@ -179,7 +178,7 @@ def flatten(d, parent_path=None): for run_hash in tqdm(run_hashes): # construct unique paths set for each run run_paths = set() - run_meta_tree = repo.request_tree('meta', run_hash, from_union=False, read_only=True).subtree('meta') + run_meta_tree = repo.request_tree('meta', run_hash, read_only=True).subtree('meta') for key in subtrees_to_lookup: try: run_paths.update(flatten(run_meta_tree.collect(key, strict=False), parent_path=(key,))) diff --git a/aim/storage/arrayview.py b/aim/storage/arrayview.py index 2b9fd8954..4694c1eab 100644 --- a/aim/storage/arrayview.py +++ b/aim/storage/arrayview.py @@ -9,8 +9,7 @@ class ArrayView: when index values are not important. """ - def __iter__(self) -> Iterator[Any]: - ... + def __iter__(self) -> Iterator[Any]: ... def keys(self) -> Iterator[int]: """Return sparse indices iterator. @@ -44,16 +43,13 @@ def items(self) -> Iterator[Tuple[int, Any]]: """ ... - def __len__(self) -> int: - ... + def __len__(self) -> int: ... - def __getitem__(self, idx: Union[int, slice]): - ... + def __getitem__(self, idx: Union[int, slice]): ... # TODO implement append - def __setitem__(self, idx: int, val: Any): - ... + def __setitem__(self, idx: int, val: Any): ... def sparse_list(self) -> Tuple[List[int], List[Any]]: """Get sparse indices and values as :obj:`list`s.""" diff --git a/aim/storage/artifacts/artifact_storage.py b/aim/storage/artifacts/artifact_storage.py index e0bab8934..efa73cbd1 100644 --- a/aim/storage/artifacts/artifact_storage.py +++ b/aim/storage/artifacts/artifact_storage.py @@ -7,13 +7,10 @@ def __init__(self, url: str): self.url = url @abstractmethod - def upload_artifact(self, file_path: str, artifact_path: str, block: bool = False): - ... + def upload_artifact(self, file_path: str, artifact_path: str, block: bool = False): ... 
@abstractmethod - def download_artifact(self, artifact_path: str, dest_dir: Optional[str] = None) -> str: - ... + def download_artifact(self, artifact_path: str, dest_dir: Optional[str] = None) -> str: ... @abstractmethod - def delete_artifact(self, artifact_path: str): - ... + def delete_artifact(self, artifact_path: str): ... diff --git a/aim/storage/inmemorytreeview.py b/aim/storage/inmemorytreeview.py index 1ce208594..7d02c347d 100644 --- a/aim/storage/inmemorytreeview.py +++ b/aim/storage/inmemorytreeview.py @@ -117,8 +117,6 @@ def iterlevel( def array(self, path: Union[AimObjectKey, AimObjectPath] = (), dtype: Any = None) -> TreeArrayView: return TreeArrayView(self.subtree(path), dtype=dtype) - def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: - ... + def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ... - def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: - ... + def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ... diff --git a/aim/storage/query.py b/aim/storage/query.py index f8fa81fbb..82de23657 100644 --- a/aim/storage/query.py +++ b/aim/storage/query.py @@ -80,8 +80,7 @@ def __init__(self, expr: str): self.expr = expr @abstractmethod - def check(self, **params) -> bool: - ... + def check(self, **params) -> bool: ... def __call__(self, **params): return self.check(**params) diff --git a/aim/storage/structured/entities.py b/aim/storage/structured/entities.py index a43471ea7..900c422ec 100644 --- a/aim/storage/structured/entities.py +++ b/aim/storage/structured/entities.py @@ -13,281 +13,224 @@ class StructuredObject(ABC): @classmethod @abstractmethod - def fields(cls): - ... + def fields(cls): ... class Searchable(ABC, Generic[T]): @classmethod @abstractmethod - def find(cls, _id: str, **kwargs) -> Optional[T]: - ... + def find(cls, _id: str, **kwargs) -> Optional[T]: ... @classmethod @abstractmethod - def all(cls, **kwargs) -> Collection[T]: - ... + def all(cls, **kwargs) -> Collection[T]: ... @classmethod @abstractmethod - def search(cls, term: str, **kwargs) -> Collection[T]: - ... + def search(cls, term: str, **kwargs) -> Collection[T]: ... class Run(StructuredObject, Searchable['Run']): @property @abstractmethod - def hash(self) -> str: - ... + def hash(self) -> str: ... @property @abstractmethod - def name(self) -> Optional[str]: - ... + def name(self) -> Optional[str]: ... @name.setter @abstractmethod - def name(self, value: str): - ... + def name(self, value: str): ... @property @abstractmethod - def description(self) -> Optional[str]: - ... + def description(self) -> Optional[str]: ... @description.setter @abstractmethod - def description(self, value: str): - ... + def description(self, value: str): ... @property @abstractmethod - def archived(self) -> bool: - ... + def archived(self) -> bool: ... @archived.setter @abstractmethod - def archived(self, value: bool): - ... + def archived(self, value: bool): ... @property @abstractmethod - def experiment(self) -> Optional['Experiment']: - ... + def experiment(self) -> Optional['Experiment']: ... @experiment.setter @abstractmethod - def experiment(self, value: str): - ... + def experiment(self, value: str): ... @property @abstractmethod - def tags(self) -> TagCollection: - ... + def tags(self) -> TagCollection: ... @abstractmethod - def add_tag(self, value: str) -> 'Tag': - ... + def add_tag(self, value: str) -> 'Tag': ... @abstractmethod - def remove_tag(self, tag_name: str) -> bool: - ... 
+ def remove_tag(self, tag_name: str) -> bool: ... @property @abstractmethod - def info(self) -> 'RunInfo': - ... + def info(self) -> 'RunInfo': ... class Experiment(StructuredObject, Searchable['Experiment']): @property @abstractmethod - def uuid(self) -> str: - ... + def uuid(self) -> str: ... @property @abstractmethod - def name(self) -> str: - ... + def name(self) -> str: ... @name.setter @abstractmethod - def name(self, value: str): - ... + def name(self, value: str): ... @property @abstractmethod - def description(self) -> Optional[str]: - ... + def description(self) -> Optional[str]: ... @description.setter @abstractmethod - def description(self, value: str): - ... + def description(self, value: str): ... @property @abstractmethod - def archived(self) -> bool: - ... + def archived(self) -> bool: ... @archived.setter @abstractmethod - def archived(self, value: bool): - ... + def archived(self, value: bool): ... @property @abstractmethod - def runs(self) -> RunCollection: - ... + def runs(self) -> RunCollection: ... class Tag(StructuredObject, Searchable['Tag']): @property @abstractmethod - def uuid(self) -> str: - ... + def uuid(self) -> str: ... @property @abstractmethod - def name(self) -> str: - ... + def name(self) -> str: ... @name.setter @abstractmethod - def name(self, value: str): - ... + def name(self, value: str): ... @property @abstractmethod - def color(self) -> str: - ... + def color(self) -> str: ... @color.setter @abstractmethod - def color(self, value: str): - ... + def color(self, value: str): ... @property @abstractmethod - def description(self) -> str: - ... + def description(self) -> str: ... @description.setter @abstractmethod - def description(self, value: str): - ... + def description(self, value: str): ... @property @abstractmethod - def archived(self) -> bool: - ... + def archived(self) -> bool: ... @archived.setter @abstractmethod - def archived(self, value: bool): - ... + def archived(self, value: bool): ... @property @abstractmethod - def runs(self) -> RunCollection: - ... + def runs(self) -> RunCollection: ... class Note(StructuredObject, Searchable['Note']): @property @abstractmethod - def id(self) -> int: - ... + def id(self) -> int: ... @property @abstractmethod - def content(self) -> str: - ... + def content(self) -> str: ... @content.setter @abstractmethod - def content(self, value: str): - ... + def content(self, value: str): ... @property @abstractmethod - def run(self) -> int: - ... + def run(self) -> int: ... class RunInfo(StructuredObject, Generic[T]): @property @abstractmethod - def last_notification_index(self) -> int: - ... + def last_notification_index(self) -> int: ... @last_notification_index.setter @abstractmethod - def last_notification_index(self, value: int): - ... + def last_notification_index(self, value: int): ... class ObjectFactory: @abstractmethod - def runs(self) -> RunCollection: - ... + def runs(self) -> RunCollection: ... @abstractmethod - def search_runs(self, term: str) -> RunCollection: - ... + def search_runs(self, term: str) -> RunCollection: ... @abstractmethod - def find_run(self, _id: str) -> Run: - ... + def find_run(self, _id: str) -> Run: ... @abstractmethod - def find_runs(self, ids: List[str]) -> List[Run]: - ... + def find_runs(self, ids: List[str]) -> List[Run]: ... @abstractmethod - def create_run(self, runhash: str) -> Run: - ... + def create_run(self, runhash: str) -> Run: ... @abstractmethod - def delete_run(self, runhash: str) -> bool: - ... + def delete_run(self, runhash: str) -> bool: ... 
@abstractmethod - def experiments(self) -> ExperimentCollection: - ... + def experiments(self) -> ExperimentCollection: ... @abstractmethod - def search_experiments(self, term: str) -> ExperimentCollection: - ... + def search_experiments(self, term: str) -> ExperimentCollection: ... @abstractmethod - def find_experiment(self, _id: str) -> Experiment: - ... + def find_experiment(self, _id: str) -> Experiment: ... @abstractmethod - def create_experiment(self, name: str) -> Experiment: - ... + def create_experiment(self, name: str) -> Experiment: ... @abstractmethod - def delete_experiment(self, _id: str) -> bool: - ... + def delete_experiment(self, _id: str) -> bool: ... @abstractmethod - def tags(self) -> TagCollection: - ... + def tags(self) -> TagCollection: ... @abstractmethod - def search_tags(self, term: str) -> TagCollection: - ... + def search_tags(self, term: str) -> TagCollection: ... @abstractmethod - def find_tag(self, _id: str) -> Tag: - ... + def find_tag(self, _id: str) -> Tag: ... @abstractmethod - def create_tag(self, name: str) -> Tag: - ... + def create_tag(self, name: str) -> Tag: ... @abstractmethod - def delete_tag(self, name: str) -> bool: - ... + def delete_tag(self, name: str) -> bool: ... diff --git a/aim/storage/treeview.py b/aim/storage/treeview.py index f80beff50..fc05a06f6 100644 --- a/aim/storage/treeview.py +++ b/aim/storage/treeview.py @@ -8,26 +8,21 @@ class TreeView: - def preload(self): - ... + def preload(self): ... - def finalize(self, index: 'TreeView'): - ... + def finalize(self, index: 'TreeView'): ... def subtree(self, path: Union[AimObjectKey, AimObjectPath]) -> 'TreeView': # Default to: return self.view(path, resolve=False) - def view(self, path: Union[AimObjectKey, AimObjectPath], resolve: bool = False): - ... + def view(self, path: Union[AimObjectKey, AimObjectPath], resolve: bool = False): ... - def make_array(self, path: Union[AimObjectKey, AimObjectPath] = ()): - ... + def make_array(self, path: Union[AimObjectKey, AimObjectPath] = ()): ... def collect( self, path: Union[AimObjectKey, AimObjectPath] = (), strict: bool = True, resolve_objects: bool = False - ) -> AimObject: - ... + ) -> AimObject: ... def __getitem__(self, path: Union[AimObjectKey, AimObjectPath]) -> AimObject: return self.collect(path) @@ -38,8 +33,7 @@ def get(self, path: Union[AimObjectKey, AimObjectPath] = (), default: Any = None except KeyError: return default - def __delitem__(self, path: Union[AimObjectKey, AimObjectPath]): - ... + def __delitem__(self, path: Union[AimObjectKey, AimObjectPath]): ... def set(self, path: Union[AimObjectKey, AimObjectPath], value: AimObject, strict: bool = True): self.__setitem__(path, value) @@ -51,25 +45,18 @@ def __setitem__(self, path: Union[AimObjectKey, AimObjectPath], value: AimObject def keys_eager( self, path: Union[AimObjectKey, AimObjectPath] = (), - ): - ... + ): ... def keys( self, path: Union[AimObjectKey, AimObjectPath] = (), level: int = None - ) -> Iterator[Union[AimObjectPath, AimObjectKey]]: - ... + ) -> Iterator[Union[AimObjectPath, AimObjectKey]]: ... - def items_eager(self, path: Union[AimObjectKey, AimObjectPath] = ()): - ... + def items_eager(self, path: Union[AimObjectKey, AimObjectPath] = ()): ... - def items(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> Iterator[Tuple[AimObjectKey, AimObject]]: - ... + def items(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> Iterator[Tuple[AimObjectKey, AimObject]]: ... 
- def array(self, path: Union[AimObjectKey, AimObjectPath] = (), dtype: Any = None) -> 'ArrayView': - ... + def array(self, path: Union[AimObjectKey, AimObjectPath] = (), dtype: Any = None) -> 'ArrayView': ... - def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: - ... + def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ... - def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: - ... + def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ... diff --git a/aim/storage/treeviewproxy.py b/aim/storage/treeviewproxy.py index f459a096a..d2e188e84 100644 --- a/aim/storage/treeviewproxy.py +++ b/aim/storage/treeviewproxy.py @@ -24,8 +24,6 @@ def __init__( sub: str, *, read_only: bool, - from_union: bool = False, - no_cache: bool = False, index=False, timeout=None, ): @@ -38,10 +36,8 @@ def __init__( 'name': name, 'sub': sub, 'read_only': read_only, - 'from_union': from_union, 'index': index, 'timeout': timeout, - 'no_cache': no_cache, } self.init_args = pack_args(encode_tree(kwargs)) self.resource_type = 'TreeView' diff --git a/aim/storage/union.pyx b/aim/storage/union.pyx index 2d5729c75..e9bafc577 100644 --- a/aim/storage/union.pyx +++ b/aim/storage/union.pyx @@ -242,11 +242,8 @@ class DB(object): index_db = None logger.info('No index was detected') - # If index exists -- only load those in progress - selector = 'progress' if index_db is not None else 'chunks' - new_dbs: Dict[bytes, aimrocks.DB] = {} - db_dir = os.path.join(self.db_path, self.db_name, selector) + db_dir = os.path.join(self.db_path, self.db_name, 'chunks') for prefix in self._list_dir(db_dir): path = os.path.join(self.db_path, self.db_name, "chunks", prefix) prefix = encode_path((self.db_name, "chunks", prefix)) diff --git a/aim/web/api/projects/project.py b/aim/web/api/projects/project.py index 2fa29ee7a..b1ae57eba 100644 --- a/aim/web/api/projects/project.py +++ b/aim/web/api/projects/project.py @@ -18,7 +18,6 @@ def __init__(self): def cleanup_repo_pools(self): self.repo.container_pool.clear() - self.repo.container_view_pool.clear() self.repo.persistent_pool.clear() def cleanup_sql_caches(self): From a1a233b55aa6849079c3e34b356eeb59bfec72c6 Mon Sep 17 00:00:00 2001 From: Albert Torosyan Date: Thu, 8 May 2025 11:19:02 +0400 Subject: [PATCH 28/30] Bump up Aim to v3.29.0 --- CHANGELOG.md | 9 ++++++++- aim/VERSION | 2 +- aim/web/ui/package.json | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d55c7567e..6518d276c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,16 @@ # Changelog -## Unreleased: +## 3.29.0 May 8, 2025: + +### Enhancements: +- Constant indexing of in-progress runs (alberttorosyan) +- Fallback to union view if index db is missing (alberttorosyan, mihran113) + ### Fixes: - Fix min/max calculation for single point metrics (mihran113) +- Aim web ui integration in jupyter/colab (larissapoghosyan) + ## 3.28.0 Mar 21, 2025 diff --git a/aim/VERSION b/aim/VERSION index a72fd67b6..c7c977326 100644 --- a/aim/VERSION +++ b/aim/VERSION @@ -1 +1 @@ -3.28.0 +3.29.0 diff --git a/aim/web/ui/package.json b/aim/web/ui/package.json index c9a9976a0..d5519f277 100644 --- a/aim/web/ui/package.json +++ b/aim/web/ui/package.json @@ -1,6 +1,6 @@ { "name": "ui_v2", - "version": "3.28.0", + "version": "3.29.0", "private": true, "dependencies": { "@aksel/structjs": "^1.0.0", From 753f4b18437b8288e1c6f7c894c14a33cba9e7d0 Mon Sep 17 00:00:00 2001 From: Albert 
Torosyan Date: Thu, 8 May 2025 13:42:47 +0400 Subject: [PATCH 29/30] Bump up Aim to v3.29.1 --- .github/workflows/python-package.yml | 1 + CHANGELOG.md | 4 +++- aim/VERSION | 2 +- aim/web/ui/package.json | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c414f5536..f8116d54a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -96,6 +96,7 @@ jobs: python -m pip install -r requirements.txt - name: Build bdist wheels for 'cp37-cp37m' + if: matrix.manylinux-version == 'manylinux_2_24_x86_64' uses: nick-fields/retry@v2 with: max_attempts: 3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6518d276c..f3b114a0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## 3.29.0 May 8, 2025: +## 3.29.1 May 8, 2025: ### Enhancements: - Constant indexing of in-progress runs (alberttorosyan) @@ -10,7 +10,9 @@ ### Fixes: - Fix min/max calculation for single point metrics (mihran113) - Aim web ui integration in jupyter/colab (larissapoghosyan) +- Package publishing for Linux/Python 3.7 (alberttorosyan) +## 3.29.0 May 8, 2025 (Yanked) ## 3.28.0 Mar 21, 2025 diff --git a/aim/VERSION b/aim/VERSION index c7c977326..1002be7fb 100644 --- a/aim/VERSION +++ b/aim/VERSION @@ -1 +1 @@ -3.29.0 +3.29.1 diff --git a/aim/web/ui/package.json b/aim/web/ui/package.json index d5519f277..99ebb2bb8 100644 --- a/aim/web/ui/package.json +++ b/aim/web/ui/package.json @@ -1,6 +1,6 @@ { "name": "ui_v2", - "version": "3.29.0", + "version": "3.29.1", "private": true, "dependencies": { "@aksel/structjs": "^1.0.0", From d67e7663fad36ba57705723d46d16ef0c6240007 Mon Sep 17 00:00:00 2001 From: mihran113 Date: Thu, 26 Jun 2025 18:30:56 +0400 Subject: [PATCH 30/30] [fix] Resolve issues with false tag reassignment (#3344) --- CHANGELOG.md | 5 ++ aim/sdk/data_version.py | 2 +- .../migrations/versions/661514b12ee1_.py | 69 +++++++++++++++++++ aim/storage/structured/db.py | 3 +- aim/storage/structured/sql_engine/entities.py | 8 +-- aim/storage/structured/sql_engine/models.py | 8 ++- 6 files changed, 85 insertions(+), 10 deletions(-) create mode 100644 aim/storage/migrations/versions/661514b12ee1_.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f3b114a0a..bf3ba2d6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Unreleased: + +### Fixes: +- Fix issues with tag false reassignment (mihran113) + ## 3.29.1 May 8, 2025: ### Enhancements: diff --git a/aim/sdk/data_version.py b/aim/sdk/data_version.py index 55f4f52d6..4c496ddb5 100644 --- a/aim/sdk/data_version.py +++ b/aim/sdk/data_version.py @@ -1 +1 @@ -DATA_VERSION = (1, 3) +DATA_VERSION = (1, 4) diff --git a/aim/storage/migrations/versions/661514b12ee1_.py b/aim/storage/migrations/versions/661514b12ee1_.py new file mode 100644 index 000000000..eacfccfe8 --- /dev/null +++ b/aim/storage/migrations/versions/661514b12ee1_.py @@ -0,0 +1,69 @@ +"""empty message + +Revision ID: 661514b12ee1 +Revises: 46b89d830ad8 +Create Date: 2025-06-05 19:52:31.221392 + +""" +from alembic import op +import sqlalchemy as sa +from alembic.context import get_context + + +# revision identifiers, used by Alembic. 
+revision = '661514b12ee1' +down_revision = '46b89d830ad8' +branch_labels = None +depends_on = None + + + +def upgrade(): + # Get the SQLite connection context + context = get_context() + naming_convention = { + "fk": + "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s", + } + # Use batch operations for SQLite + with op.batch_alter_table('run_tag', naming_convention=naming_convention) as batch_op: + # First drop the existing foreign key + batch_op.drop_constraint('fk_run_tag_run_id_run', type_='foreignkey') + batch_op.drop_constraint('fk_run_tag_tag_id_tag', type_='foreignkey') + + # Then create a new one with CASCADE + batch_op.create_foreign_key('fk_run_tag_run_id_run', 'run', ['run_id'], ['id'], ondelete='CASCADE') + batch_op.create_foreign_key('fk_run_tag_tag_id_tag', 'tag', ['tag_id'], ['id'], ondelete='CASCADE') + + + with op.batch_alter_table('note', naming_convention=naming_convention) as batch_op: + # First drop the existing foreign key + batch_op.drop_constraint('fk_note_run_id_run', type_='foreignkey') + + # Then create a new one with CASCADE + batch_op.create_foreign_key('fk_note_run_id_run', 'run', ['run_id'], ['id'], ondelete='CASCADE') + + +def downgrade(): + # Use batch operations for SQLite + naming_convention = { + "fk": + "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s", + } + # Use batch operations for SQLite + with op.batch_alter_table('run_tag', naming_convention=naming_convention) as batch_op: + # Drop the CASCADE foreign key + batch_op.drop_constraint('fk_run_tag_run_id_run', type_='foreignkey') + batch_op.drop_constraint('fk_run_tag_tag_id_tag', type_='foreignkey') + + # Then create a new one with CASCADE + batch_op.create_foreign_key('fk_run_tag_run_id_run', 'run', ['run_id'], ['id'],) + batch_op.create_foreign_key('fk_run_tag_tag_id_tag', 'tag', ['tag_id'], ['id'],) + + with op.batch_alter_table('note', naming_convention=naming_convention) as batch_op: + # First drop the existing foreign key + batch_op.drop_constraint('fk_note_run_id_run', type_='foreignkey') + + # Then create a new one with CASCADE + batch_op.create_foreign_key('fk_note_run_id_run', 'run', ['run_id'], ['id'],) + diff --git a/aim/storage/structured/db.py b/aim/storage/structured/db.py index 830c0bc41..cf0087a57 100644 --- a/aim/storage/structured/db.py +++ b/aim/storage/structured/db.py @@ -9,7 +9,7 @@ ) from aim.storage.types import SafeNone from aim.web.configs import AIM_LOG_LEVEL_KEY -from sqlalchemy import create_engine +from sqlalchemy import create_engine, event from sqlalchemy.orm import scoped_session, sessionmaker @@ -66,6 +66,7 @@ def __init__(self, path: str, readonly: bool = False): pool_size=10, max_overflow=20, ) + event.listen(self.engine, 'connect', lambda c, _: c.execute('pragma foreign_keys=on')) self.session_cls = scoped_session(sessionmaker(autoflush=False, bind=self.engine)) self._upgraded = None diff --git a/aim/storage/structured/sql_engine/entities.py b/aim/storage/structured/sql_engine/entities.py index 554d4c70d..21d08626f 100644 --- a/aim/storage/structured/sql_engine/entities.py +++ b/aim/storage/structured/sql_engine/entities.py @@ -87,11 +87,9 @@ def from_hash(cls, runhash: str, created_at, session) -> 'ModelMappedRun': @classmethod def delete_run(cls, runhash: str, session) -> bool: - try: - rows_affected = session.query(RunModel).filter(RunModel.hash == runhash).delete() - session_commit_or_flush(session) - except Exception: - return False + rows_affected = session.query(RunModel).filter(RunModel.hash == runhash).delete() + 
session_commit_or_flush(session) + return rows_affected > 0 @classmethod diff --git a/aim/storage/structured/sql_engine/models.py b/aim/storage/structured/sql_engine/models.py index 1c78c539e..9859a8d85 100644 --- a/aim/storage/structured/sql_engine/models.py +++ b/aim/storage/structured/sql_engine/models.py @@ -28,7 +28,7 @@ def default_to_run_hash(context): run_tags = Table( 'run_tag', Base.metadata, - Column('run_id', Integer, ForeignKey('run.id'), primary_key=True, nullable=False), + Column('run_id', Integer, ForeignKey('run.id', ondelete='CASCADE'), primary_key=True, nullable=False), Column('tag_id', Integer, ForeignKey('tag.id'), primary_key=True, nullable=False), ) @@ -51,7 +51,9 @@ class Run(Base): experiment_id = Column(ForeignKey('experiment.id'), nullable=True) experiment = relationship('Experiment', backref=backref('runs', uselist=True, order_by='Run.created_at.desc()')) - tags = relationship('Tag', secondary=run_tags, backref=backref('runs', uselist=True)) + tags = relationship( + 'Tag', secondary=run_tags, backref=backref('runs', uselist=True), cascade='all, delete', passive_deletes=True + ) notes = relationship('Note', back_populates='run') def __init__(self, run_hash, created_at=None): @@ -106,7 +108,7 @@ class Note(Base): id = Column(Integer, autoincrement=True, primary_key=True) content = Column(Text, nullable=False, default='') - run_id = Column(Integer, ForeignKey('run.id')) + run_id = Column(Integer, ForeignKey('run.id', ondelete='CASCADE'),) experiment_id = Column(Integer, ForeignKey('experiment.id')) created_at = Column(DateTime, default=datetime.datetime.utcnow)
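For context on the foreign-key changes in this last patch: SQLite does not enforce foreign keys (and therefore `ON DELETE CASCADE`) unless `PRAGMA foreign_keys` is switched on per connection, which is what the new `connect` event listener in `db.py` does. Below is a self-contained sketch of the same pattern with SQLAlchemy Core; the schema is a simplified, illustrative stand-in for the `run`/`tag`/`run_tag` association, not Aim's actual models:

```python
import sqlalchemy as sa
from sqlalchemy import event

engine = sa.create_engine('sqlite:///example.db')

# SQLite ships with foreign-key enforcement disabled; enable it on every new
# DBAPI connection so ON DELETE CASCADE actually fires.
event.listen(engine, 'connect', lambda dbapi_conn, _: dbapi_conn.execute('pragma foreign_keys=on'))

metadata = sa.MetaData()
run = sa.Table('run', metadata, sa.Column('id', sa.Integer, primary_key=True))
tag = sa.Table('tag', metadata, sa.Column('id', sa.Integer, primary_key=True))
run_tag = sa.Table(
    'run_tag', metadata,
    sa.Column('run_id', sa.Integer, sa.ForeignKey('run.id', ondelete='CASCADE'), primary_key=True),
    sa.Column('tag_id', sa.Integer, sa.ForeignKey('tag.id'), primary_key=True),
)
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(run.insert().values(id=1))
    conn.execute(tag.insert().values(id=1))
    conn.execute(run_tag.insert().values(run_id=1, tag_id=1))
    # Deleting the run cascades to the association row, so the tag cannot be
    # falsely "reassigned" to a later run that reuses the same integer id.
    conn.execute(run.delete().where(run.c.id == 1))
    assert conn.execute(sa.select(sa.func.count()).select_from(run_tag)).scalar() == 0
```

Without the pragma listener the cascade clause is silently ignored on SQLite, which is why the Alembic migration alone is not sufficient and `db.py` is patched alongside it.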