Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
==========
Developers
==========

* Kay Zhu (a.k.a He Zhu) <me@kayzhu.com>
* Hobson Lane <github+lshash3@totalgood.com>
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
v0.0.8 2017/04/28 -- `$ pip install lshash3`, `>>> from lshash.lshash import LSHash` because it's Python3 compatible!
v0.0.7 2017/04/28 -- Package name is now lshash3 because it's Python3 compatible! Demo hyperspheres in tests.
v0.0.6 2016/12/01 -- PyScaffolding's setup.cfg eliminates `import lshash` in setup.py but still fails for an Ubuntu python3 virtualenv
v0.0.5 2016/12/01 -- PyScaffolding's setup.cfg eliminates `import lshash` in setup.py
v0.0.3, 2012/12/28 -- Doc fixes.
v0.0.2, 2012/12/28 -- Doc fixes and lowercase package name.
v0.0.1, 2012/12/20 -- Initial release.
16 changes: 5 additions & 11 deletions lshash/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
# lshash/__init__.py
# Copyright 2012 Kay Zhu (a.k.a He Zhu) and contributors (see CONTRIBUTORS.txt)
#
# This module is part of lshash and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
import pkg_resources

__title__ = 'lshash'
__author__ = 'Kay Zhu (me@kayzhu.com)'
__license__ = 'MIT'
__version__ = '0.0.4dev'

from lshash import LSHash
try:
    # Ask setuptools for the installed distribution's version so the
    # package metadata is the single source of truth.
    __version__ = pkg_resources.get_distribution(__name__).version
except Exception:  # DistributionNotFound or corrupt metadata (e.g. source checkout)
    # Fallback for running straight from a source tree without installation.
    __version__ = '0.0.4dev'
26 changes: 23 additions & 3 deletions lshash/lshash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,39 @@
#
# This module is part of lshash and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals, division, absolute_import
from builtins import int, round, str, object # noqa
from future import standard_library
standard_library.install_aliases() # noqa: Counter, OrderedDict,
from past.builtins import basestring # noqa:

import future # noqa
import builtins # noqa
import past # noqa
import six # noqa

import os
import json
import numpy as np

from storage import storage
try:
from storage import storage # py2
except ImportError:
from .storage import storage # py3

try:
from bitarray import bitarray
except ImportError:
bitarray = None


try:
xrange # py2
except NameError:
xrange = range # py3


class LSHash(object):
""" LSHash implments locality sensitive hashing using random projection for
input vectors of dimension `input_dim`.
Expand Down Expand Up @@ -263,7 +283,7 @@ def query(self, query_point, num_results=None, distance_func=None):
# rank candidates by distance function
candidates = [(ix, d_func(query_point, self._as_np_array(ix)))
for ix in candidates]
candidates.sort(key=lambda x: x[1])
candidates = sorted(candidates, key=lambda x: x[1])

return candidates[:num_results] if num_results else candidates

Expand Down Expand Up @@ -298,4 +318,4 @@ def l1norm_dist(x, y):

@staticmethod
def cosine_dist(x, y):
return 1 - np.dot(x, y) / ((np.dot(x, x) * np.dot(y, y)) ** 0.5)
return 1 - float(np.dot(x, y)) / ((np.dot(x, x) * np.dot(y, y)) ** 0.5)
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
bitarray
numpy
# redis
86 changes: 86 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
[metadata]

name = lshash3
summary = A fast Python implementation of locality sensitive hashing with persistence (Redis) support.
author = Kay Zhu
author-email = me@kayzhu.com
license = MIT
home-page = http://...
description-file = README.rst
# Add here all kinds of additional classifiers as defined under
# https://pypi.python.org/pypi?%3Aaction=list_classifiers
classifier =
Development Status :: 4 - Beta
Programming Language :: Python
Programming Language :: Python :: 2
Programming Language :: Python :: 3
Intended Audience :: Developers
License :: OSI Approved :: MIT License
Operating System :: OS Independent
Topic :: Software Development :: Libraries

[entry_points]
# Add here console scripts like:
# console_scripts =
# script_name = lshash.module:function
# For example:
# console_scripts =
# fibonacci = lshash.skeleton:run
# as well as other entry_points.


[files]
# Add here 'data_files', 'packages' or 'namespace_packages'.
# Additional data files are defined as key value pairs of source and target:
packages =
lshash
tests
# data_files =
# share/lshash_docs = docs/*

[extras]
# Add here additional requirements for extra features, like:
Redis =
redis>=2.10.0
Hamming =
bitarray

[test]
# py.test options when running `python setup.py test`
addopts = tests

[pytest]
# Options for py.test:
# Specify command line options as you would do when invoking py.test directly.
# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
# in order to write a coverage file that can be read by Jenkins.
addopts =
--cov lshash --cov-report term-missing
--verbose

[aliases]
docs = build_sphinx

[bdist_wheel]
# Use this option if your package is pure-python
universal = 1

[build_sphinx]
source_dir = docs
build_dir = docs/_build

[pbr]
# Let pbr run sphinx-apidoc
autodoc_tree_index_modules = True
# autodoc_tree_excludes = ...
# Let pbr itself generate the apidoc
# autodoc_index_modules = True
# autodoc_exclude_modules = ...
# Convert warnings to errors
# warnerrors = True

[devpi:upload]
# Options for the devpi: PyPI server and packaging tool
# VCS export must be deactivated since we are using setuptools-scm
no-vcs = 1
formats = bdist_wheel
49 changes: 13 additions & 36 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Setup file for lshash.
This file was generated with PyScaffold 2.5.6
"""

import lshash
import sys
from setuptools import setup

try:
from setuptools import setup
except ImportError:
from distutils.core import setup

def setup_package():
    """Run setuptools, letting PyScaffold derive version/metadata from setup.cfg."""
    # 'sphinx' is only required at setup time for the doc-building commands.
    needs_sphinx = {'build_sphinx', 'upload_docs'}.intersection(sys.argv)
    sphinx = ['sphinx'] if needs_sphinx else []
    # use_pyscaffold=True delegates packaging configuration to PyScaffold;
    # 'six' is needed because the package imports it at setup time.
    setup(setup_requires=['six', 'pyscaffold>=2.5a0,<2.6a0'] + sphinx,
          use_pyscaffold=True)

with open('README.rst') as f:
readme = f.read()

with open('LICENSE') as f:
license = f.read()

with open('CHANGES.rst') as f:
changes = f.read()

required = ['numpy']

setup(
name='lshash',
version=lshash.__version__,
packages=['lshash'],
author='Kay Zhu',
author_email='me@kayzhu.com',
maintainer='Kay Zhu',
maintainer_email='me@kayzhu.com',
description='A fast Python implementation of locality sensitive hashing with persistance support.',
long_description=readme + '\n\n' + changes,
license=license,
requires=required,
classifiers=[
'Intended Audience :: Developers',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Topic :: Software Development :: Libraries',
],
)
if __name__ == "__main__":
setup_package()
5 changes: 5 additions & 0 deletions test-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Add requirements only needed for your unittests and during development here.
# They will be installed automatically when running `python setup.py test`.
# ATTENTION: Don't remove pytest-cov and pytest as they are needed.
pytest-cov
pytest
Empty file added tests/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Dummy conftest.py for lshash.

If you don't know what this is for, just leave it empty.
Read more about conftest.py under:
https://pytest.org/latest/plugins.html
"""
from __future__ import print_function, absolute_import, division

import pytest
71 changes: 71 additions & 0 deletions tests/test_spheres.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
from __future__ import print_function, unicode_literals, division, absolute_import
from builtins import int, round, str, object # noqa
from future import standard_library
standard_library.install_aliases() # noqa: Counter, OrderedDict,
from past.builtins import basestring # noqa:

import numpy as np

import future # noqa
import builtins # noqa
import past # noqa
import six # noqa
from lshash.lshash import LSHash

__author__ = "Hobson Lane"
__copyright__ = "Kay Zhu (a.k.a He Zhu)"
__license__ = "MIT"


def test_sphere():
    """Index 1000 random points on the unit 2-sphere and query a nearby point.

    Checks that a small perturbation of an indexed point returns at least 10
    candidates and that the 10th-nearest cosine distance is small but nonzero.
    """
    # Seed so the threshold assertions below are reproducible; an unseeded
    # RNG makes this test flaky across runs.
    np.random.seed(0)
    X = np.random.normal(size=(1000, 3))
    lsh = LSHash(10, 3, num_hashtables=5)
    for x in X:
        x /= np.linalg.norm(x)  # project onto the unit sphere (mutates X row in place)
        lsh.index(x)
    # Query a tiny perturbation of the first indexed point.
    closest = lsh.query(X[0] + np.array([-0.001, 0.001, -0.001]), distance_func="cosine")
    assert len(closest) >= 10
    # 10th-ranked neighbor: close (<= 0.05) but not the query point itself (> 0.0003).
    assert 0.05 >= closest[9][-1] > 0.0003


def hyperspheres_10D(X=None):
    """ Demonstrate curse of dimensionality and where LSH starts to fail

    Args:
        X: optional (num_points, max_dim) array of sample points; defaults to
           200k points drawn uniformly from the 10-D unit hypercube.
           NOTE(review): rows of X are normalized in place below, so a
           caller-supplied array is mutated — confirm that is acceptable.

    Returns:
        lsh, X, closest, secondclosest, tenthclosest

    >>> import pandas as pd
    >>> lsh, vectors, rank1, rank2, rank10 = hyperspheres_10D()
    >>> pd.DataFrame(rank2)
    >>> pd.DataFrame(rank10)
    """
    if X is None:
        # Generated lazily: the original default argument allocated a ~16 MB
        # array at import time and shared (and mutated) it across calls.
        X = np.random.uniform(size=(200000, 10))
    tenthclosest = []
    secondclosest = []
    closest = []
    for D in range(2, X.shape[1]):
        lsh = LSHash(int(1024 * 8182. / D) + D, D, num_hashtables=D)

        # unit-length query vector
        q = np.random.uniform(size=(D,))
        q /= np.linalg.norm(q)

        distances = []
        for x in X[:, :D]:
            # NOTE(review): the point is indexed *before* normalization while
            # the brute-force distances below use the normalized vector —
            # confirm this asymmetry is intended.
            lsh.index(x)
            x /= np.linalg.norm(x)
            distances += [1. - np.sum(x * q)]  # cosine distance of unit vectors
        distances = sorted(distances)
        print(distances[:10])
        closest10 = lsh.query(q, distance_func='cosine')

        # Record [dimension, rank used, LSH distance (2. if no hits), true distance]
        N = len(closest10)
        tenthclosest += [[D, min(9, N - 1) if N else -1, closest10[min(9, N - 1)][-1] if N else 2., distances[min(9, N - 1)]]]
        secondclosest += [[D, min(1, N - 1) if N else -1, closest10[min(1, N - 1)][-1] if N else 2., distances[min(1, N - 1)]]]
        closest += [[D, 0 if N else -1, closest10[0][-1] if N else 2., distances[0]]]
        print(tenthclosest[-1])
        print(secondclosest[-1])
        print(closest[-1])
    # for i, tc in enumerate(tenthclosest):
    #     assert 1e-9 < tc[-2] or 1e-6 < 0.2
    return lsh, X, closest, secondclosest, tenthclosest