diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..95b6eb6 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,6 @@ +========== +Developers +========== + +* Kay Zhu (a.k.a He Zhu) +* Hobson Lane diff --git a/CHANGES.rst b/CHANGES.rst index d73c038..dad875a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,7 @@ +v0.0.8 2017/04/28 -- `$ pip install lshash3`, `>>> import lshash.lshash.LSHash` because it's Python3 compatible! +v0.0.7 2017/04/28 -- Package name is now lshash3 because it's Python3 compatible! Demo hyperspheres in tests. +v0.0.6 2016/12/01 -- PyScaffolding's setup.cfg eliminates `import lshash` in setup.py but still fails for an Ubuntu python3 virtualenv +v0.0.5 2016/12/01 -- PyScaffolding's setup.cfg eliminates `import lshash` in setup.py v0.0.3, 2012/12/28 -- Doc fixes. v0.0.2, 2012/12/28 -- Doc fixes and lowercase package name. v0.0.1, 2012/12/20 -- Initial release. diff --git a/lshash/__init__.py b/lshash/__init__.py index e805896..33a96e6 100644 --- a/lshash/__init__.py +++ b/lshash/__init__.py @@ -1,12 +1,6 @@ -# lshash/__init__.py -# Copyright 2012 Kay Zhu (a.k.a He Zhu) and contributors (see CONTRIBUTORS.txt) -# -# This module is part of lshash and is released under -# the MIT License: http://www.opensource.org/licenses/mit-license.php +import pkg_resources -__title__ = 'lshash' -__author__ = 'Kay Zhu (me@kayzhu.com)' -__license__ = 'MIT' -__version__ = '0.0.4dev' - -from lshash import LSHash +try: + __version__ = pkg_resources.get_distribution(__name__).version +except: + __version__ = '0.0.4dev' diff --git a/lshash/lshash.py b/lshash/lshash.py index 5c895a6..e72254b 100644 --- a/lshash/lshash.py +++ b/lshash/lshash.py @@ -3,12 +3,26 @@ # # This module is part of lshash and is released under # the MIT License: http://www.opensource.org/licenses/mit-license.php +# -*- coding: utf-8 -*- +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import int, round, str, object # noqa +from 
future import standard_library +standard_library.install_aliases() # noqa: Counter, OrderedDict, +from past.builtins import basestring # noqa: + +import future # noqa +import builtins # noqa +import past # noqa +import six # noqa import os import json import numpy as np -from storage import storage +try: + from storage import storage # py2 +except ImportError: + from .storage import storage # py3 try: from bitarray import bitarray @@ -16,6 +30,12 @@ bitarray = None +try: + xrange # py2 +except NameError: + xrange = range # py3 + + class LSHash(object): """ LSHash implments locality sensitive hashing using random projection for input vectors of dimension `input_dim`. @@ -263,7 +283,7 @@ def query(self, query_point, num_results=None, distance_func=None): # rank candidates by distance function candidates = [(ix, d_func(query_point, self._as_np_array(ix))) for ix in candidates] - candidates.sort(key=lambda x: x[1]) + candidates = sorted(candidates, key=lambda x: x[1]) return candidates[:num_results] if num_results else candidates @@ -298,4 +318,4 @@ def l1norm_dist(x, y): @staticmethod def cosine_dist(x, y): - return 1 - np.dot(x, y) / ((np.dot(x, x) * np.dot(y, y)) ** 0.5) + return 1 - float(np.dot(x, y)) / ((np.dot(x, x) * np.dot(y, y)) ** 0.5) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..39c78fb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +bitarray +numpy +# redis diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..98a1dc4 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,86 @@ +[metadata] + +name = lshash3 +summary = A fast Python implementation of locality sensitive hashing with persistance (Redis) support. +author = Kay Zhu +author-email = me@kayzhu.com +license = MIT +home-page = http://... 
+description-file = README.rst +# Add here all kinds of additional classifiers as defined under +# https://pypi.python.org/pypi?%3Aaction=list_classifiers +classifier = + Development Status :: 4 - Beta + Programming Language :: Python + Programming Language :: Python :: 2 + Programming Language :: Python :: 3 + Intended Audience :: Developers + License :: OSI Approved :: MIT License + Operating System :: OS Independent + Topic :: Software Development :: Libraries + +[entry_points] +# Add here console scripts like: +# console_scripts = +# script_name = lshash.module:function +# For example: +# console_scripts = +# fibonacci = lshash.skeleton:run +# as well as other entry_points. + + +[files] +# Add here 'data_files', 'packages' or 'namespace_packages'. +# Additional data files are defined as key value pairs of source and target: +packages = + lshash + tests +# data_files = +# share/lshash_docs = docs/* + +[extras] +# Add here additional requirements for extra features, like: +Redis = + redis>=2.10.0 +Hamming = + bitarray + +[test] +# py.test options when running `python setup.py test` +addopts = tests + +[pytest] +# Options for py.test: +# Specify command line options as you would do when invoking py.test directly. +# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml +# in order to write a coverage file that can be read by Jenkins. +addopts = + --cov lshash --cov-report term-missing + --verbose + +[aliases] +docs = build_sphinx + +[bdist_wheel] +# Use this option if your package is pure-python +universal = 1 + +[build_sphinx] +source_dir = docs +build_dir = docs/_build + +[pbr] +# Let pbr run sphinx-apidoc +autodoc_tree_index_modules = True +# autodoc_tree_excludes = ... +# Let pbr itself generate the apidoc +# autodoc_index_modules = True +# autodoc_exclude_modules = ... 
+# Convert warnings to errors +# warnerrors = True + +[devpi:upload] +# Options for the devpi: PyPI server and packaging tool +# VCS export must be deactivated since we are using setuptools-scm +no-vcs = 1 +formats = bdist_wheel diff --git a/setup.py b/setup.py index b0a98e5..6765d73 100644 --- a/setup.py +++ b/setup.py @@ -1,42 +1,19 @@ +#!/usr/bin/env python # -*- coding: utf-8 -*- +""" Setup file for lshash. + This file was generated with PyScaffold 2.5.6 +""" -import lshash +import sys +from setuptools import setup -try: - from setuptools import setup -except ImportError: - from distutils.core import setup +def setup_package(): + needs_sphinx = {'build_sphinx', 'upload_docs'}.intersection(sys.argv) + sphinx = ['sphinx'] if needs_sphinx else [] + setup(setup_requires=['six', 'pyscaffold>=2.5a0,<2.6a0'] + sphinx, + use_pyscaffold=True) -with open('README.rst') as f: - readme = f.read() -with open('LICENSE') as f: - license = f.read() - -with open('CHANGES.rst') as f: - changes = f.read() - -required = ['numpy'] - -setup( - name='lshash', - version=lshash.__version__, - packages=['lshash'], - author='Kay Zhu', - author_email='me@kayzhu.com', - maintainer='Kay Zhu', - maintainer_email='me@kayzhu.com', - description='A fast Python implementation of locality sensitive hashing with persistance support.', - long_description=readme + '\n\n' + changes, - license=license, - requires=required, - classifiers=[ - 'Intended Audience :: Developers', - 'License :: OSI Approved :: MIT License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Topic :: Software Development :: Libraries', - ], -) +if __name__ == "__main__": + setup_package() diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 0000000..468f195 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1,5 @@ +# Add requirements only needed for your unittests and during development here. 
+# They will be installed automatically when running `python setup.py test`. +# ATTENTION: Don't remove pytest-cov and pytest as they are needed. +pytest-cov +pytest diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..aab09f8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + Dummy conftest.py for lshash. + + If you don't know what this is for, just leave it empty. + Read more about conftest.py under: + https://pytest.org/latest/plugins.html +""" +from __future__ import print_function, absolute_import, division + +import pytest diff --git a/tests/test_spheres.py b/tests/test_spheres.py new file mode 100644 index 0000000..9421662 --- /dev/null +++ b/tests/test_spheres.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import int, round, str, object # noqa +from future import standard_library +standard_library.install_aliases() # noqa: Counter, OrderedDict, +from past.builtins import basestring # noqa: + +import numpy as np + +import future # noqa +import builtins # noqa +import past # noqa +import six # noqa +from lshash.lshash import LSHash + +__author__ = "Hobson Lane" +__copyright__ = "Kay Zhu (a.k.a He Zhu)" +__license__ = "MIT" + + +def test_sphere(): + X = np.random.normal(size=(1000, 3)) + lsh = LSHash(10, 3, num_hashtables=5) + for x in X: + x /= np.linalg.norm(x) + lsh.index(x) + closest = lsh.query(X[0] + np.array([-0.001, 0.001, -0.001]), distance_func="cosine") + assert len(closest) >= 10 + assert 0.05 >= closest[9][-1] > 0.0003 + + +def hyperspheres_10D(X=np.random.uniform(size=(200000, 10))): + """ Demonstrate curse of dimensionality and where LSH starts to fail + + Returns: + lsh, X, closest, secondclosest, tenthclosest + + >>> import pandas as pd + >>> lsh, vectors, rank1, rank2, 
rank10 = hyperspheres_10D() + >>> pd.DataFrame(rank2) + >>> pd.DataFrame(rank10) + """ + tenthclosest = [] + secondclosest = [] + closest = [] + for D in range(2, X.shape[1]): + lsh = LSHash(int(1024 * 8182. / D) + D, D, num_hashtables=D) + + # query vector + q = np.random.uniform(size=(D,)) + q /= np.linalg.norm(q) + + distances = [] + for x in X[:, :D]: + lsh.index(x) + x /= np.linalg.norm(x) + distances += [1. - np.sum(x * q)] # cosine similarity + distances = sorted(distances) + print(distances[:10]) + closest10 = lsh.query(q, distance_func='cosine') + + N = len(closest10) + tenthclosest += [[D, min(9, N - 1) if N else -1, closest10[min(9, N - 1)][-1] if N else 2., distances[min(9, N - 1)]]] + secondclosest += [[D, min(1, N - 1) if N else -1, closest10[min(1, N - 1)][-1] if N else 2., distances[min(1, N - 1)]]] + closest += [[D, 0 if N else -1, closest10[0][-1] if N else 2., distances[0]]] + print(tenthclosest[-1]) + print(secondclosest[-1]) + print(closest[-1]) + # for i, tc in enumerate(tenthclosest): + # assert 1e-9 < tc[-2] or 1e-6 < 0.2 + return lsh, X, closest, secondclosest, tenthclosest