Skip to content
This repository was archived by the owner on Jan 22, 2026. It is now read-only.

Commit ce131ef

Browse files
authored
Bug 1306049 - Sanitize string args to "where" (#161)
Hide the implementation details about how dimensions are sanitized for S3 storage. This way you can do things like: dataset.where(docType="saved-session") instead of having to know that the hyphen is replaced with an underscore when data is saved (and used "saved_session").
1 parent b4d68d7 commit ce131ef

2 files changed

Lines changed: 37 additions & 6 deletions

File tree

moztelemetry/dataset.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import functools
77
import json
88
import random
9+
import re
910
import types
1011
from copy import copy
1112
from inspect import isfunction
@@ -19,6 +20,7 @@
1920
from .store import S3Store
2021

2122
MAX_CONCURRENCY = int(cpu_count() * 1.5)
23+
SANITIZE_PATTERN = re.compile("[^a-zA-Z0-9_.]")
2224

2325

2426
def _group_by_size_greedy(obj_list, tot_groups):
@@ -154,12 +156,27 @@ def _apply_selection(self, json_obj):
154156
return dict((name, path.search(json_obj))
155157
for name, path in self.selection_compiled.items())
156158

159+
def _sanitize_dimension(self, v):
160+
"""Sanitize the given string by replacing illegal characters
161+
with underscores.
162+
163+
For String conditions, we should pre-sanitize so that users of
164+
the `where` function do not need to know about the nuances of how
165+
S3 dimensions are sanitized during ingestion.
166+
167+
See https://github.com/mozilla-services/lua_sandbox_extensions/blob/master/moz_telemetry/io_modules/moz_telemetry/s3.lua#L167
168+
169+
:param v: a string value that should be sanitized.
170+
"""
171+
return re.sub(SANITIZE_PATTERN, "_", v)
172+
157173
def where(self, **kwargs):
158174
"""Return a new Dataset refined using the given condition
159175
160176
:param kwargs: a map of `dimension` => `condition` to filter the elements
161177
of the dataset. `condition` can either be an exact value or a
162-
callable returning a boolean value.
178+
callable returning a boolean value. If `condition` is a value, it is
179+
converted to a string, then sanitized.
163180
"""
164181
clauses = copy(self.clauses)
165182
for dimension, condition in kwargs.items():
@@ -170,7 +187,7 @@ def where(self, **kwargs):
170187
if isfunction(condition) or isinstance(condition, functools.partial):
171188
clauses[dimension] = condition
172189
else:
173-
clauses[dimension] = functools.partial((lambda x, y: x == y), condition)
190+
clauses[dimension] = functools.partial((lambda x, y: x == y), self._sanitize_dimension(str(condition)))
174191
return Dataset(self.bucket, self.schema, store=self.store,
175192
prefix=self.prefix, clauses=clauses, selection=self.selection)
176193

tests/test_dataset.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,11 @@ def test_apply_selection():
9191

9292
def test_where_exact_match():
9393
dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
94-
new_dataset = dataset.where(dim1='my-value')
94+
new_dataset = dataset.where(dim1='myvalue')
9595
assert new_dataset is not dataset
9696
assert new_dataset.clauses.keys() == ['dim1']
9797
condition = new_dataset.clauses['dim1']
98-
assert condition('my-value')
98+
assert condition('myvalue')
9999

100100

101101
def test_where_wrong_dimension():
@@ -169,11 +169,11 @@ def test_scan_multiple_where_params(spark_context):
169169

170170
def test_scan_multiple_params():
171171
dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
172-
new_dataset = dataset.where(dim1='my-value')
172+
new_dataset = dataset.where(dim1='myvalue')
173173
assert new_dataset is not dataset
174174
assert new_dataset.clauses.keys() == ['dim1']
175175
condition = new_dataset.clauses['dim1']
176-
assert condition('my-value')
176+
assert condition('myvalue')
177177

178178

179179
def test_summaries(spark_context):
@@ -396,3 +396,17 @@ def test_prefix_slash(spark_context):
396396
summaries_filtered = dataset.where(dim1='dir1').summaries(spark_context)
397397
assert len(summaries_filtered) == 1
398398
assert summaries_filtered[0]['key'] == 'a/b/dir1/subdir1/key1'
399+
400+
401+
def test_sanitized_dimensions(spark_context):
402+
bucket_name = 'test-bucket'
403+
store = InMemoryStore(bucket_name)
404+
store.store['dir_1/subdir1/key1'] = 'value1'
405+
store.store['dir_1/subdir2/key2'] = 'value2'
406+
store.store['dir_2/subdir3/key3'] = 'value3'
407+
store.store['dir_3/subdir4/key4'] = 'value4'
408+
409+
dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).where(dim1="dir-1")
410+
411+
summaries = dataset.summaries(spark_context)
412+
assert len(summaries) == 2

0 commit comments

Comments
 (0)