Skip to content
This repository was archived by the owner on Jan 22, 2026. It is now read-only.

Commit 26e5deb

Browse files
authored
Add a utility function for reading main_summary. (#177)
* Add a utility function for reading main_summary. This supports merging schemas for only a subset of partitions. * Replace findspark with pyspark * Remove findspark from setup.py
1 parent bf624ef commit 26e5deb

15 files changed

Lines changed: 287 additions & 17 deletions

File tree

Dockerfile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,20 @@ ENV HBASE_VERSION=1.2.3
88
RUN apt-get update && apt-get install -y g++ libpython-dev libsnappy-dev
99

1010
# setup conda environment
11-
RUN wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
11+
12+
# temporary workaround, pin miniconda version until it's fixed.
13+
RUN wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh
1214
RUN bash miniconda.sh -b -p /miniconda
1315
ENV PATH="/miniconda/bin:${PATH}"
1416
RUN hash -r
1517
RUN conda config --set always_yes yes --set changeps1 no
16-
RUN conda update -q conda
18+
# TODO: uncomment
19+
# RUN conda update -q conda
1720
RUN conda info -a # Useful for debugging any issues with conda
1821

1922
# install spark/hbase
20-
RUN wget -nv https://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION-bin-hadoop2.6.tgz
2123
RUN wget -nv https://archive.apache.org/dist/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz
22-
RUN tar -zxf spark-$SPARK_VERSION-bin-hadoop2.6.tgz
2324
RUN tar -zxf hbase-$HBASE_VERSION-bin.tar.gz
24-
ENV SPARK_HOME="/spark-${SPARK_VERSION}-bin-hadoop2.6"
2525

2626
# build + activate conda environment
2727
COPY ./environment.yml /python_moztelemetry/

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@ name: test-environment
22
dependencies:
33
- python=2.7
44
- pandas
5+
- pyspark
56
- python-snappy
67
- snappy

moztelemetry/standards.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,65 @@ def get_last_month_range():
136136
end_of_last_month = snap_to_beginning_of_month(today) - timedelta(days=1)
137137
start_of_last_month = snap_to_beginning_of_month(end_of_last_month)
138138
return (start_of_last_month, end_of_last_month)
139+
140+
141+
def read_main_summary(spark,
                      submission_date_s3=None,
                      sample_id=None,
                      mergeSchema=True,
                      path='s3://telemetry-parquet/main_summary/v4'):
    """ Efficiently read main_summary parquet data.

    Read data from the given path, optionally filtering to a
    specified set of partition values first. This can save a
    lot of time, particularly if `mergeSchema` is True.

    Args:
        spark: SparkSession (or SQLContext) whose `read` attribute
            provides a DataFrameReader.
        submission_date_s3: Optional list of values to filter the
            `submission_date_s3` partition. Default is to read all
            partitions.
        sample_id: Optional list of values to filter the `sample_id`
            partition. Default is to read all partitions.
        mergeSchema (bool): Determines whether or not to merge the
            schemas of the resulting parquet files (ie. whether to
            support schema evolution or not). Default is to merge
            schemas.
        path (str): Location (disk or S3) from which to read data.
            Default is to read from the "production" location on S3.

    Returns:
        A DataFrame loaded from the specified partitions.
    """
    base_path = path

    # Specifying basePath retains the partition fields even
    # if we read a bunch of paths separately.
    reader = spark.read.option("basePath", base_path)
    if mergeSchema:
        reader = reader.option("mergeSchema", "true")

    if submission_date_s3 is not None:
        # Enumerate the partition directories explicitly so Spark only
        # lists (and schema-merges) the requested partitions.
        if sample_id is None:
            paths = ["{}/submission_date_s3={}/".format(base_path, sd)
                     for sd in submission_date_s3]
        else:
            # One path per (submission_date_s3, sample_id) pair, with
            # submission_date_s3 as the outer (slower-varying) loop.
            paths = ["{}/submission_date_s3={}/sample_id={}/".format(
                         base_path, sd, si)
                     for sd in submission_date_s3
                     for si in sample_id]
        return reader.parquet(*paths)

    # No submission_date_s3 filter: read everything under base_path.
    data = reader.parquet(base_path)
    if sample_id is not None:
        # Ugh, why? We would have to iterate the entire path to identify
        # all the submission_date_s3 partitions, which may end up being
        # slower. So read everything and filter afterwards instead.
        criteria = "sample_id IN ({})".format(
            ",".join("{}".format(s) for s in sample_id))
        data = data.where(criteria)
    return data

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
package_dir={'moztelemetry': 'moztelemetry'},
1818
install_requires=['boto', 'boto3', 'ujson', 'requests', 'protobuf',
1919
'expiringdict', 'functools32', 'futures', 'py4j',
20-
'pandas>=0.14.1', 'numpy>=1.8.2', 'findspark',
20+
'pandas>=0.14.1', 'numpy>=1.8.2',
2121
'happybase>=1.1.0', 'PyYAML', 'python-snappy'],
2222
setup_requires=['pytest-runner', 'setuptools_scm'],
2323
# put pytest last to workaround this bug

tests/conftest.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,7 @@
88
import pytest
99
from moto import mock_s3
1010
from concurrent import futures
11-
12-
import findspark
13-
findspark.init()
14-
import pyspark # noqa
11+
from pyspark.sql import SparkSession
1512

1613

1714
@pytest.fixture
@@ -48,17 +45,26 @@ def dummy_bucket(my_mock_s3):
4845
return bucket
4946

5047

51-
@pytest.fixture(scope='session')
52-
def spark_context(request):
48+
@pytest.fixture(scope="session")
49+
def spark():
50+
spark = (
51+
SparkSession
52+
.builder
53+
.master("local")
54+
.appName("python_moztelemetry_test")
55+
.getOrCreate()
56+
)
57+
5358
logger = logging.getLogger("py4j")
5459
logger.setLevel(logging.ERROR)
55-
sc = pyspark.SparkContext(master="local[1]")
5660

57-
def finalizer():
58-
return sc.stop()
59-
request.addfinalizer(finalizer)
61+
yield spark
62+
spark.stop()
6063

61-
return sc
64+
65+
@pytest.fixture
66+
def spark_context(spark):
67+
return spark.sparkContext
6268

6369

6470
@pytest.fixture(scope='session')
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)