diff --git a/README.md b/README.md
index 2f4cc27..ed359d7 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ Tabular Data Transformation
 * [Parse a datetime column into its components (year, month, etc.)](split_datetime_column.py)
 * [Convert a column of datetime strings into UNIX timestamps](convert_column_to_timestamp.py)
 * [Expand an SFrame column of type list/dict into multiple columns](sframe_unpack.py)
+* [Find the mode of an SArray](sarray_mode.py)
 
 Graph Data Transformation
 ---------------------------
diff --git a/sarray_mode.py b/sarray_mode.py
new file mode 100644
index 0000000..e0df485
--- /dev/null
+++ b/sarray_mode.py
@@ -0,0 +1,88 @@
+import graphlab as gl
+
+def mode_sa(sa, single_mode=True):
+    """Return a mode (most frequent value) of sa, or all modes if there are several.
+
+    Occurrences are counted exactly with an SFrame groupby.
+
+    sa: the SArray to search; must not be empty.
+    single_mode: whether to return a single mode or an SArray of all modes (default: True).
+        When several values tie and single_mode is True, one of them is returned.
+
+    Raises ValueError if sa is empty."""
+
+    if len(sa) == 0:
+        raise ValueError("Can't find mode(s) in empty SArray")
+
+    # One row per distinct value, with the number of times it occurs.
+    sf = gl.SFrame({"value": sa})
+    sf2 = sf.groupby("value", {"count": gl.aggregate.COUNT()})
+    max_count_index = sf2["count"].argmax()
+
+    if single_mode:
+        return sf2[max_count_index]["value"]
+
+    else:
+        # Keep every value whose count ties with the maximum.
+        max_count = sf2[max_count_index]["count"]
+        return sf2[sf2["count"] == max_count]["value"]
+
+
+# Create an SArray with two modes (most-common elements: 2 and 3)
+sa = gl.SArray([1, 2, 2, 3, 3])
+
+# Find one of the modes
+single_mode = mode_sa(sa) # returns 2
+
+# Find all modes
+all_modes = mode_sa(sa, single_mode=False)
+# Returns
+# dtype: int
+# Rows: 2
+# [2, 3]
+
+
+# A faster (albeit maybe less accurate) way to find the mode value is using sa.sketch_summary().frequent_items() .
+# There are two caveats to this approach:
+# 1. won't work for very low-frequency mode values, and
+# 2. won't necessarily give the correct result if there are multiple likely candidates.
+
+def sketch_mode_sa(sa, single_mode=True):
+    """Fast (albeit less accurate) way to find the mode value(s) of SArray sa.
+
+    sa: the SArray to search; must not be empty.
+    single_mode: whether to return a single mode or a list of all modes (default: True).
+
+    Raises ValueError if sa is empty, and IndexError if single_mode is True
+    but the sketch reports no frequent items."""
+
+    if len(sa) == 0:
+        raise ValueError("Can't find mode(s) in empty SArray")
+
+    frequent_items_sketch = sa.sketch_summary().frequent_items()
+
+    # Find the highest count once instead of once per item. items()/values()
+    # exist on both Python 2 and Python 3 dicts, unlike iteritems()/itervalues().
+    if frequent_items_sketch:
+        max_count = max(frequent_items_sketch.values())
+        modes_sketch = [k for (k, v) in frequent_items_sketch.items()
+                        if v == max_count]
+    else:
+        modes_sketch = []
+    return modes_sketch[0] if single_mode else modes_sketch
+
+sketch_mode_sa(sa) # returns 2
+sketch_mode_sa(sa, single_mode=False) # returns [2, 3]
+
+
+# Both approaches should handle empty SArrays.
+# The implementations above will simply raise a ValueError if `sa` is empty.
+try:
+    mode_sa(gl.SArray([]))
+except ValueError:
+    pass
+
+try:
+    sketch_mode_sa(gl.SArray([]))
+except ValueError:
+    pass