diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 00000000..6ace61c3
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,61 @@
+version: 2
+
+
+shared: &shared
+ working_directory: ~/shorttext
+
+ steps:
+ - checkout
+
+ - run:
+ name: Apt Install
+ command: |
+ sudo apt-get update
+ sudo apt-get install libc6
+ sudo apt-get install python3-dev
+ sudo apt-get install -y g++
+
+ - run:
+ name: Install Python Packages
+ command: |
+ pip install --upgrade --user pip
+ pip install --upgrade --user google-compute-engine
+ pip install --user .
+
+ - run:
+ name: Run Unit Tests
+ command: |
+ pip install --user .[test]
+ pytest
+
+
+jobs:
+ py39:
+ <<: *shared
+ docker:
+ - image: cimg/python:3.9
+
+ py310:
+ <<: *shared
+ docker:
+ - image: cimg/python:3.10
+
+ py311:
+ <<: *shared
+ docker:
+ - image: cimg/python:3.11
+
+ py312:
+ <<: *shared
+ docker:
+ - image: cimg/python:3.12
+
+
+workflows:
+ version: 2
+ build:
+ jobs:
+ - py39
+ - py310
+ - py311
+ - py312
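The config above relies on a YAML anchor (`shared: &shared`) and a merge key (`<<: *shared`) so that every Python-version job reuses the same checkout, install, and test steps and differs only in its Docker image. Below is a minimal sketch of how the merge expands, assuming PyYAML is available; the trimmed snippet inside it is illustrative, not the full config:

```python
# Illustrative only: shows how the "<<" merge key expands the &shared anchor
# used in .circleci/config.yml. Requires PyYAML (pip install pyyaml).
import yaml

snippet = """
shared: &shared
  working_directory: ~/shorttext
  steps:
    - checkout

jobs:
  py312:
    <<: *shared
    docker:
      - image: cimg/python:3.12
"""

config = yaml.safe_load(snippet)
# The py312 job inherits working_directory and steps from the anchor
# and adds its own docker image on top of them.
print(config["jobs"]["py312"]["working_directory"])   # ~/shorttext
print(config["jobs"]["py312"]["docker"][0]["image"])  # cimg/python:3.12
```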
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
new file mode 100644
index 00000000..4f63870e
--- /dev/null
+++ b/.github/workflows/publish-to-pypi.yml
@@ -0,0 +1,34 @@
+name: Publish to PyPI
+
+on:
+ release:
+ types: [published]
+
+jobs:
+ publish-to-pypi:
+ name: Publish to PyPI
+ runs-on: ubuntu-latest
+ permissions:
+ id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.10"
+
+ - name: Install build dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install build twine
+
+ - name: Build package
+ run: python -m build
+
+ - name: Publish package to PyPI
+ env:
+ TWINE_USERNAME: __token__
+ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+ run: twine upload dist/*
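The workflow above builds the distribution with `python -m build` and uploads it with `twine`. Below is a hedged local sketch of the same two steps, useful for inspecting the artifacts before tagging a release; it assumes the `build` and `twine` packages are installed and is not part of the workflow itself:

```python
# Local dry run of the build/publish steps (illustrative only).
import glob
import subprocess

# Build the sdist and wheel into dist/, as the "Build package" step does.
subprocess.run(["python", "-m", "build"], check=True)

# Validate the metadata of every built artifact before any upload is attempted.
subprocess.run(["twine", "check", *glob.glob("dist/*")], check=True)
```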
diff --git a/.gitignore b/.gitignore
index 2b50e0a7..e0b5b946 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,60 +1,5 @@
-
-# Created by https://www.gitignore.io/api/python,pycharm
-
-### PyCharm ###
-# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
-# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
-
-# User-specific stuff:
-.idea/**/workspace.xml
-.idea/**/tasks.xml
-
-# Sensitive or high-churn files:
-.idea/**/dataSources/
-.idea/**/dataSources.ids
-.idea/**/dataSources.xml
-.idea/**/dataSources.local.xml
-.idea/**/sqlDataSources.xml
-.idea/**/dynamic.xml
-.idea/**/uiDesigner.xml
-.idea/inspectionProfiles/*.xml
-
-# Gradle:
-.idea/**/gradle.xml
-.idea/**/libraries
-
-# Mongo Explorer plugin:
-.idea/**/mongoSettings.xml
-
-## File-based project format:
-*.iws
-
-## Plugin-specific files:
-
-# IntelliJ
-/out/
-
-# mpeltonen/sbt-idea plugin
-.idea_modules/
-
-# JIRA plugin
-atlassian-ide-plugin.xml
-
-# Crashlytics plugin (for Android Studio and IntelliJ)
-com_crashlytics_export_strings.xml
-crashlytics.properties
-crashlytics-build.properties
-fabric.properties
-
-### PyCharm Patch ###
-# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
-
-# *.iml
-# modules.xml
-# .idea/misc.xml
-# *.ipr
-
-### Python ###
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -65,7 +10,6 @@ __pycache__/
# Distribution / packaging
.Python
-env/
build/
develop-eggs/
dist/
@@ -78,9 +22,12 @@ parts/
sdist/
var/
wheels/
+pip-wheel-metadata/
+share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
+MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
@@ -95,13 +42,17 @@ pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
+.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
-*,cover
+*.cover
+*.py,cover
.hypothesis/
+.pytest_cache/
+cover/
# Translations
*.mo
@@ -110,6 +61,8 @@ coverage.xml
# Django stuff:
*.log
local_settings.py
+db.sqlite3
+db.sqlite3-journal
# Flask stuff:
instance/
@@ -122,29 +75,840 @@ instance/
docs/_build/
# PyBuilder
+.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+
# pyenv
-.python-version
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
-# celery beat schedule file
+# Celery stuff
celerybeat-schedule
+celerybeat.pid
-# dotenv
-.env
+# SageMath parsed files
+*.sage.py
-# virtualenv
+# Environments
+.env
.venv
+env/
venv/
ENV/
+env.bak/
+venv.bak/
# Spyder project settings
.spyderproject
+.spyproject
# Rope project settings
.ropeproject
-# End of https://www.gitignore.io/api/python,pycharm
\ No newline at end of file
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+### Emacs template
+# -*- mode: gitignore; -*-
+*~
+\#*\#
+/.emacs.desktop
+/.emacs.desktop.lock
+*.elc
+auto-save-list
+tramp
+.\#*
+
+# Org-mode
+.org-id-locations
+*_archive
+
+# flymake-mode
+*_flymake.*
+
+# eshell files
+/eshell/history
+/eshell/lastdir
+
+# elpa packages
+/elpa/
+
+# reftex files
+*.rel
+
+# AUCTeX auto folder
+/auto/
+
+# cask packages
+.cask/
+
+# Flycheck
+flycheck_*.el
+
+# server auth directory
+/server/
+
+# projectiles files
+.projectile
+
+# directory configuration
+.dir-locals.el
+
+# network security
+/network-security.data
+
+
+### C template
+# Prerequisites
+*.d
+
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+
+# Linker output
+*.ilk
+*.map
+*.exp
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Debug files
+*.dSYM/
+*.su
+*.idb
+*.pdb
+
+# Kernel Module Compile Results
+*.mod*
+*.cmd
+.tmp_versions/
+modules.order
+Module.symvers
+Mkfile.old
+dkms.conf
+
+### JupyterNotebooks template
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+
+*/.ipynb_checkpoints/*
+
+# IPython
+
+# Remove previous ipynb_checkpoints
+# git rm -r .ipynb_checkpoints/
+
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+.idea
+
+### C++ template
+# Prerequisites
+
+# Compiled Object files
+*.slo
+
+# Precompiled Headers
+
+# Compiled Dynamic libraries
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+
+# Executables
+
+### Linux template
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### ArchLinuxPackages template
+*.tar
+*.tar.*
+*.jar
+*.msi
+*.zip
+*.tgz
+*.log.*
+*.sig
+
+pkg/
+
+### Fortran template
+# Prerequisites
+
+# Compiled Object files
+
+# Precompiled Headers
+
+# Compiled Dynamic libraries
+
+# Fortran module files
+
+# Compiled Static libraries
+
+# Executables
+
+### macOS template
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### VisualStudio template
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+##
+## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
+
+# User-specific files
+*.rsuser
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Mono auto generated files
+mono_crash.*
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+[Ww][Ii][Nn]32/
+[Aa][Rr][Mm]/
+[Aa][Rr][Mm]64/
+bld/
+[Bb]in/
+[Oo]bj/
+[Ll]og/
+[Ll]ogs/
+
+# Visual Studio 2015/2017 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# Visual Studio 2017 auto generated files
+Generated\ Files/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUnit
+*.VisualState.xml
+TestResult.xml
+nunit-*.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# Benchmark Results
+BenchmarkDotNet.Artifacts/
+
+# .NET Core
+project.lock.json
+project.fragment.lock.json
+artifacts/
+
+# ASP.NET Scaffolding
+ScaffoldingReadMe.txt
+
+# StyleCop
+StyleCopReport.xml
+
+# Files built by Visual Studio
+*_i.c
+*_p.c
+*_h.h
+*.meta
+*.iobj
+*.ipdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*_wpftmp.csproj
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+*.VC.VC.opendb
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+*.sap
+
+# Visual Studio Trace Files
+*.e2e
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# AxoCover is a Code Coverage Tool
+.axoCover/*
+!.axoCover/settings.json
+
+# Coverlet is a free, cross platform Code Coverage Tool
+coverage*[.json, .xml, .info]
+
+# Visual Studio code coverage results
+*.coverage
+*.coveragexml
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# Note: Comment the next line if you want to checkin your web deploy settings,
+# but database connection strings (with potential passwords) will be unencrypted
+*.pubxml
+*.publishproj
+
+# Microsoft Azure Web App publish settings. Comment the next line if you want to
+# checkin your Azure Web App publish settings, but sensitive information contained
+# in these scripts will be unencrypted
+PublishScripts/
+
+# NuGet Packages
+*.nupkg
+# NuGet Symbol Packages
+*.snupkg
+# The packages folder can be ignored because of Package Restore
+**/[Pp]ackages/*
+# except build/, which is used as an MSBuild target.
+!**/[Pp]ackages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/[Pp]ackages/repositories.config
+# NuGet v3's project.json files produces more ignorable files
+*.nuget.props
+*.nuget.targets
+
+# Microsoft Azure Build Output
+csx/
+*.build.csdef
+
+# Microsoft Azure Emulator
+ecf/
+rcf/
+
+# Windows Store app package directories and files
+AppPackages/
+BundleArtifacts/
+Package.StoreAssociation.xml
+_pkginfo.txt
+*.appx
+*.appxbundle
+*.appxupload
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!?*.[Cc]ache/
+
+# Others
+ClientBin/
+~$*
+*.dbmdl
+*.dbproj.schemaview
+*.jfm
+*.pfx
+*.publishsettings
+orleans.codegen.cs
+
+# Including strong name files can present a security risk
+# (https://github.com/github/gitignore/pull/2483#issue-259490424)
+#*.snk
+
+# Since there are multiple workflows, uncomment next line to ignore bower_components
+# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+#bower_components/
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+ServiceFabricBackup/
+*.rptproj.bak
+
+# SQL Server files
+*.mdf
+*.ldf
+*.ndf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+*.rptproj.rsuser
+*- [Bb]ackup.rdl
+*- [Bb]ackup ([0-9]).rdl
+*- [Bb]ackup ([0-9][0-9]).rdl
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# GhostDoc plugin setting file
+*.GhostDoc.xml
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+node_modules/
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+*.vbw
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+# Paket dependency manager
+.paket/paket.exe
+paket-files/
+
+# FAKE - F# Make
+.fake/
+
+# CodeRush personal settings
+.cr/personal
+
+# Python Tools for Visual Studio (PTVS)
+*.pyc
+
+# Cake - Uncomment if you are using it
+# tools/**
+# !tools/packages.config
+
+# Tabs Studio
+*.tss
+
+# Telerik's JustMock configuration file
+*.jmconfig
+
+# BizTalk build output
+*.btp.cs
+*.btm.cs
+*.odx.cs
+*.xsd.cs
+
+# OpenCover UI analysis results
+OpenCover/
+
+# Azure Stream Analytics local run output
+ASALocalRun/
+
+# MSBuild Binary and Structured Log
+*.binlog
+
+# NVidia Nsight GPU debugger configuration file
+*.nvuser
+
+# MFractors (Xamarin productivity tool) working folder
+.mfractor/
+
+# Local History for Visual Studio
+.localhistory/
+
+# BeatPulse healthcheck temp database
+healthchecksdb
+
+# Backup folder for Package Reference Convert tool in Visual Studio 2017
+MigrationBackup/
+
+# Ionide (cross platform F# VS Code tools) working folder
+.ionide/
+
+### CUDA template
+*.i
+*.ii
+*.gpu
+*.ptx
+*.cubin
+*.fatbin
+
+### Eclipse template
+.metadata
+bin/
+tmp/
+*.bak
+*.swp
+*~.nib
+local.properties
+.settings/
+.loadpath
+.recommenders
+
+# External tool builders
+.externalToolBuilders/
+
+# Locally stored "Eclipse launch configurations"
+*.launch
+
+# PyDev specific (Python IDE for Eclipse)
+*.pydevproject
+
+# CDT-specific (C/C++ Development Tooling)
+.cproject
+
+# CDT- autotools
+.autotools
+
+# Java annotation processor (APT)
+.factorypath
+
+# PDT-specific (PHP Development Tools)
+.buildpath
+
+# sbteclipse plugin
+.target
+
+# Tern plugin
+.tern-project
+
+# TeXlipse plugin
+.texlipse
+
+# STS (Spring Tool Suite)
+.springBeans
+
+# Code Recommenders
+.recommenders/
+
+# Annotation Processing
+.apt_generated/
+.apt_generated_test/
+
+# Scala IDE specific (Scala & Java development for Eclipse)
+.cache-main
+.scala_dependencies
+.worksheet
+
+# Uncomment this line if you wish to ignore the project description file.
+# Typically, this file would be tracked if it contains build/dependency configurations:
+#.project
+
+### Windows template
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+### VisualStudioCode template
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+
+# Local History for Visual Studio Code
+.history/
+
+### VirtualEnv template
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+pip-selfcheck.json
+
+### Xcode template
+# Xcode
+#
+# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
+
+## User settings
+xcuserdata/
+
+## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9)
+*.xcscmblueprint
+*.xccheckout
+
+## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4)
+DerivedData/
+*.moved-aside
+*.pbxuser
+!default.pbxuser
+*.mode1v3
+!default.mode1v3
+*.mode2v3
+!default.mode2v3
+*.perspectivev3
+!default.perspectivev3
+
+## Gcc Patch
+/*.gcno
diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 00000000..da10187e
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,31 @@
+# .readthedocs.yml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+ configuration: docs/conf.py
+
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.12"
+
+# Build documentation with MkDocs
+#mkdocs:
+# configuration: mkdocs.yml
+
+# Optionally build your docs in additional formats such as PDF and ePub
+formats: all
+
+# Optionally set the version of Python and requirements required to build your docs
+python:
+ install:
+ - requirements: docs/requirements_minimal.txt
+
+# conda environment
+#conda:
+# environment: environment.yml
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 31ed28ef..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,16 +0,0 @@
-language: python
-python:
- - "2.7"
-before_install:
- - wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh
- - chmod +x miniconda.sh
- - ./miniconda.sh -b
- - export PATH=/home/travis/miniconda2/bin:$PATH
- - conda update --yes conda
-install:
- - conda create --yes -n shorttext-test python=$TRAVIS_PYTHON_VERSION pip numpy scipy
- - source activate shorttext-test
- - pip install unittest2
- - pip install pytest
- - pip install -U .
-script: python shorttext_tests.py
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..e2a3f566
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2016 Kwan Yuet Stephen Ho
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
index 17bc2ad2..4a6b9900 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,6 @@
include README.md
-include shorttext/data/shorttext_exampledata.csv
-include shorttext/utils/stopwordset.pkl
\ No newline at end of file
+include LICENSE
+include pyproject.toml
+include src/shorttext/data/shorttext_exampledata.csv
+include src/shorttext/utils/stopwords.txt
+include src/shorttext/utils/nonneg_stopwords.txt
diff --git a/README.md b/README.md
index 5a9d8661..c029523c 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,153 @@
-# Short Text Categorization in Python
+# Short Text Mining in Python
-This repository is a collection of algorithms for multi-class classification to short texts using Python. Modules are backward compatible unless otherwise specified. Feel free to give suggestions.
+[](https://circleci.com/gh/stephenhky/PyShortTextCategorization.svg)
+[](https://github.com/stephenhky/PyShortTextCategorization/releases)
+[](https://shorttext.readthedocs.io/en/latest/?badge=latest)
+[](https://pyup.io/repos/github/stephenhky/PyShortTextCategorization/)
+[](https://pyup.io/repos/github/stephenhky/PyShortTextCategorization/)
+[](https://pypi.org/project/shorttext/)
+[](https://pypi.org/project/shorttext/)
+[](https://github.com/stephenhky/PyShortTextCategorization)
+
+## Introduction
+
+The Python package `shorttext` facilitates supervised and unsupervised
+learning for short text categorization. Because of the sparseness of words and
+the lack of information carried in the short texts themselves, an intermediate
+representation of the texts and documents is needed before they are put into
+any classification algorithm. This package provides various types
+of these representations, including topic modeling and word-embedding algorithms.
+
+The package `shorttext` runs on Python 3.9, 3.10, 3.11, and 3.12.
+Characteristics:
+
+- example data provided (including subject keywords and NIH RePORT);
+- text preprocessing;
+- pre-trained word-embedding support;
+- `gensim` topic models (LDA, LSI, Random Projections) and autoencoder;
+- topic model representation supported for supervised learning using `scikit-learn`;
+- cosine distance classification;
+- neural network classification (including ConvNet, and C-LSTM);
+- maximum entropy classification;
+- metrics of phrases differences, including soft Jaccard score (using Damerau-Levenshtein distance), and Word Mover's distance (WMD);
+- character-level sequence-to-sequence (seq2seq) learning; and
+- spell correction.
+
+## Documentation
+
+Documentation and tutorials for `shorttext` can be found here: [http://shorttext.rtfd.io/](http://shorttext.rtfd.io/).
+
+See [tutorial](http://shorttext.readthedocs.io/en/latest/tutorial.html) for how to use the package, and [FAQ](https://shorttext.readthedocs.io/en/latest/faq.html).
+
+## Installation
To install it, in a console, use `pip`.
```
->>> pip install -U shorttext
+>>> pip install shorttext
```
-or, if you want the most updated code that is not released on PyPI yet, type
+or, if you want the most recent development version on Github, type
```
->>> pip install -U git+https://github.com/stephenhky/PyShortTextCategorization@master
+>>> pip install git+https://github.com/stephenhky/PyShortTextCategorization@master
```
-Developers are advised to make sure `Keras` >=2 be installed. Users are advised to install the backend `Tensorflow` (preferred) or `Theano` in advance.
+See [installation guide](https://shorttext.readthedocs.io/en/latest/install.html) for more details.
-See [tutorial](http://pythonhosted.org/shorttext/tutorial.html) for how to use the package.
-# Issues
+## Issues
To report any issues, go to the [Issues](https://github.com/stephenhky/PyShortTextCategorization/issues) tab of the Github page and start a thread.
It is welcome for developers to submit pull requests on their own
to fix any errors.
-# Useful Links
+## Contributors
+
+If you would like to contribute, feel free to submit pull requests to the `develop` branch.
+You can talk to me in advance through e-mail or the [Issues](https://github.com/stephenhky/PyShortTextCategorization/issues) page.
-* Documentation : [https://pythonhosted.org/shorttext/](https://pythonhosted.org/shorttext/)
+## Useful Links
+
+* Documentation: [http://shorttext.readthedocs.io](http://shorttext.readthedocs.io/)
* Github: [https://github.com/stephenhky/PyShortTextCategorization](https://github.com/stephenhky/PyShortTextCategorization)
-* PyPI: [https://pypi.python.org/pypi/shorttext](https://pypi.python.org/pypi/shorttext)
+* PyPI: [https://pypi.org/project/shorttext/](https://pypi.org/project/shorttext/)
+* "Package shorttext 1.0.0 released," [Medium](https://medium.com/@stephenhky/package-shorttext-1-0-0-released-ca3cb24d0ff3)
* "Python Package for Short Text Mining", [WordPress](https://datawarrior.wordpress.com/2016/12/22/python-package-for-short-text-mining/)
+* "Document-Term Matrix: Text Mining in R and Python," [WordPress](https://datawarrior.wordpress.com/2018/01/22/document-term-matrix-text-mining-in-r-and-python/)
* An [earlier version](https://github.com/stephenhky/PyShortTextCategorization/tree/b298d3ce7d06a9b4e0f7d32f27bab66064ba7afa) of this repository is a demonstration of the following blog post: [Short Text Categorization using Deep Neural Networks and Word-Embedding Models](https://datawarrior.wordpress.com/2016/10/12/short-text-categorization-using-deep-neural-networks-and-word-embedding-models/)
-# News
+## News
+
+* 10/27/2025: `shorttext` 3.0.1 released.
+* 08/10/2025: `shorttext` 3.0.0 released.
+* 06/02/2025: `shorttext` 2.2.1 released. (Acknowledgement: [Minseo Kim](https://kmingseo.github.io/))
+* 05/29/2025: `shorttext` 2.2.0 released. (Acknowledgement: [Minseo Kim](https://kmingseo.github.io/))
+* 05/08/2025: `shorttext` 2.1.1 released.
+* 12/14/2024: `shorttext` 2.1.0 released.
+* 07/12/2024: `shorttext` 2.0.0 released.
+* 12/21/2023: `shorttext` 1.6.1 released.
+* 08/26/2023: `shorttext` 1.6.0 released.
+* 06/19/2023: `shorttext` 1.5.9 released.
+* 09/23/2022: `shorttext` 1.5.8 released.
+* 09/22/2022: `shorttext` 1.5.7 released.
+* 08/29/2022: `shorttext` 1.5.6 released.
+* 05/28/2022: `shorttext` 1.5.5 released.
+* 12/15/2021: `shorttext` 1.5.4 released.
+* 07/11/2021: `shorttext` 1.5.3 released.
+* 07/06/2021: `shorttext` 1.5.2 released.
+* 04/10/2021: `shorttext` 1.5.1 released.
+* 04/09/2021: `shorttext` 1.5.0 released.
+* 02/11/2021: `shorttext` 1.4.8 released.
+* 01/11/2021: `shorttext` 1.4.7 released.
+* 01/03/2021: `shorttext` 1.4.6 released.
+* 12/28/2020: `shorttext` 1.4.5 released.
+* 12/24/2020: `shorttext` 1.4.4 released.
+* 11/10/2020: `shorttext` 1.4.3 released.
+* 10/18/2020: `shorttext` 1.4.2 released.
+* 09/23/2020: `shorttext` 1.4.1 released.
+* 09/02/2020: `shorttext` 1.4.0 released.
+* 07/23/2020: `shorttext` 1.3.0 released.
+* 06/05/2020: `shorttext` 1.2.6 released.
+* 05/20/2020: `shorttext` 1.2.5 released.
+* 05/13/2020: `shorttext` 1.2.4 released.
+* 04/28/2020: `shorttext` 1.2.3 released.
+* 04/07/2020: `shorttext` 1.2.2 released.
+* 03/23/2020: `shorttext` 1.2.1 released.
+* 03/21/2020: `shorttext` 1.2.0 released.
+* 12/01/2019: `shorttext` 1.1.6 released.
+* 09/24/2019: `shorttext` 1.1.5 released.
+* 07/20/2019: `shorttext` 1.1.4 released.
+* 07/07/2019: `shorttext` 1.1.3 released.
+* 06/05/2019: `shorttext` 1.1.2 released.
+* 04/23/2019: `shorttext` 1.1.1 released.
+* 03/03/2019: `shorttext` 1.1.0 released.
+* 02/14/2019: `shorttext` 1.0.8 released.
+* 01/30/2019: `shorttext` 1.0.7 released.
+* 01/29/2019: `shorttext` 1.0.6 released.
+* 01/13/2019: `shorttext` 1.0.5 released.
+* 10/03/2018: `shorttext` 1.0.4 released.
+* 08/06/2018: `shorttext` 1.0.3 released.
+* 07/24/2018: `shorttext` 1.0.2 released.
+* 07/17/2018: `shorttext` 1.0.1 released.
+* 07/14/2018: `shorttext` 1.0.0 released.
+* 06/18/2018: `shorttext` 0.7.2 released.
+* 05/30/2018: `shorttext` 0.7.1 released.
+* 05/17/2018: `shorttext` 0.7.0 released.
+* 02/27/2018: `shorttext` 0.6.0 released.
+* 01/19/2018: `shorttext` 0.5.11 released.
+* 01/15/2018: `shorttext` 0.5.10 released.
+* 12/14/2017: `shorttext` 0.5.9 released.
+* 11/08/2017: `shorttext` 0.5.8 released.
+* 10/27/2017: `shorttext` 0.5.7 released.
+* 10/17/2017: `shorttext` 0.5.6 released.
+* 09/28/2017: `shorttext` 0.5.5 released.
+* 09/08/2017: `shorttext` 0.5.4 released.
+* 09/02/2017: end of GSoC project. ([Report](https://rare-technologies.com/chinmayas-gsoc-2017-summary-integration-with-sklearn-keras-and-implementing-fasttext/))
+* 08/22/2017: `shorttext` 0.5.1 released.
+* 07/28/2017: `shorttext` 0.4.1 released.
+* 07/26/2017: `shorttext` 0.4.0 released.
* 06/16/2017: `shorttext` 0.3.8 released.
* 06/12/2017: `shorttext` 0.3.7 released.
* 06/02/2017: `shorttext` 0.3.6 released.
@@ -48,6 +162,7 @@ to fix any errors.
* 11/25/2016: `shorttext` 0.1.2 released.
* 11/21/2016: `shorttext` 0.1.1 released.
-# Possible Future Updates
+# Acknowledgements
-Refer to [UPCOMING.md](UPCOMING.md).
+* [Chinmaya Pancholi](https://www.linkedin.com/in/cpancholi/)
+* [Minseo Kim](https://kmingseo.github.io/)
diff --git a/UPCOMING.md b/UPCOMING.md
deleted file mode 100644
index 1e115263..00000000
--- a/UPCOMING.md
+++ /dev/null
@@ -1,22 +0,0 @@
-Upcoming Updates to `shorttext`
-===============================
-
-Confirmed Updates
------------------
-
-* Maximum entropy models;
-* Use of `gensim` Word2Vec `keras` layers.
-
-Expected Updates
-----------------
-
-* Incorporating new features from `gensim`;
-* Implementation of author-topic model;
-* Python 3 compatibility;
-* More neural networks;
-* More available corpus;
-* Generative models;
-* Support of seq2seq models;
-* Gradual fading-out dependence on `Theano`, and lesser `keras` but more fundamental `Tensorflow`;
-* Spelling corrections and fuzzy logic;
-* Other word-embedding models.
\ No newline at end of file
diff --git a/bin/ShortTextCategorizerConsole b/bin/ShortTextCategorizerConsole
deleted file mode 100644
index 93157b81..00000000
--- a/bin/ShortTextCategorizerConsole
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python
-
-# argument parsing
-import argparse
-
-def get_argparser():
- argparser = argparse.ArgumentParser(description='Perform prediction on short text with a given trained model.')
- argparser.add_argument('model_filepath', help='Path of the trained (compact) model.')
- argparser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model. (None if not needed)')
- argparser.add_argument('--topn', type=int, default=10, help='Number of top-scored results displayed. (Default: 10)')
- return argparser
-
-argparser = get_argparser()
-args = argparser.parse_args()
-
-allowed_classifiers = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder', 'topic_sklearn',
- 'nnlibvec', 'sumvec', 'maxent']
-needembedded_classifiers = ['nnlibvec', 'sumvec']
-topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder']
-
-# library loading
-import os
-
-import shorttext
-from shorttext.utils.classification_exceptions import Word2VecModelNotExistException, AlgorithmNotExistException
-
-# main block
-if __name__ == '__main__':
- # check if the model file is given
- if not os.path.exists(args.model_filepath):
- raise IOError('Model file '+args.model_filepath+' not found!')
-
- # get the name of the classifier
- print 'Retrieving classifier name...'
- classifier_name = shorttext.utils.compactmodel_io.get_model_classifier_name(args.model_filepath)
- if not (classifier_name in allowed_classifiers):
- raise AlgorithmNotExistException(classifier_name)
-
- # load the Word2Vec model if necessary
- wvmodel = None
- if classifier_name in needembedded_classifiers:
- # check if thw word embedding model is available
- if not os.path.exists(args.wv):
- raise Word2VecModelNotExistException(args.wv)
- # if there, load it
- print 'Loading word-embedding model...', args.wv
- wvmodel = shorttext.utils.load_word2vec_model(args.wv)
-
- # load the classifier
- print 'Initializing the classifier...'
- classifier = None
- if classifier_name in topicmodels:
- topicmodel = shorttext.smartload_compact_model(args.model_filepath, wvmodel)
- classifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodel)
- else:
- classifier = shorttext.smartload_compact_model(args.model_filepath, wvmodel)
-
- # Initializing the SpaCy kernel
- shorttext.utils.textpreprocessing.spaCyNLPHolder.getNLPInstance()
-
- # Console
- run = True
- while run:
- shorttext = raw_input('text> ')
- if len(shorttext) > 0:
- scoredict = classifier.score(shorttext)
- for label, score in sorted(scoredict.items(), key=lambda s: s[1], reverse=True)[:args.topn]:
- print label, ' : ', score
- else:
- run = False
-
- print 'Done.'
\ No newline at end of file
diff --git a/bin/ShortTextWord2VecSimilarity b/bin/ShortTextWord2VecSimilarity
deleted file mode 100644
index 4ba6a80f..00000000
--- a/bin/ShortTextWord2VecSimilarity
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/env python
-
-# argument parsing
-import argparse
-
-def getargparser():
- parser = argparse.ArgumentParser(description='Find the similarity between two short sentences using Word2Vec.')
- parser.add_argument('word2vec_modelpath', help='Path of the Word2Vec model')
- return parser
-
-parser = getargparser()
-args = parser.parse_args()
-
-import shorttext
-from shorttext.utils import tokenize
-from scipy.spatial import distance
-from itertools import product
-import numpy as np
-
-class ShortSentenceWord2VecSimilarity:
- def __init__(self, modelpath):
- self.model = shorttext.utils.load_word2vec_model(modelpath) if modelpath!=None else None
-
- def sim_words(self, word1, word2):
- return 1-distance.cosine(self.model[word1], self.model[word2])
-
- def jaccardscore_sents(self, sent1, sent2):
- tokens1 = tokenize(sent1)
- tokens2 = tokenize(sent2)
- tokens1 = filter(lambda w: w in self.model, tokens1)
- tokens2 = filter(lambda w: w in self.model, tokens2)
- allowable1 = [True]*len(tokens1)
- allowable2 = [True]*len(tokens2)
-
- simdict = dict()
- for i, j in product(range(len(tokens1)), range(len(tokens2))):
- simdict[(i, j)] = self.sim_words(tokens1[i], tokens2[j])
-
- intersection = 0.0
- simdictitems = sorted(simdict.items(), key=lambda s: s[1], reverse=True)
- for idxtuple, sim in simdictitems:
- i, j = idxtuple
- if allowable1[i] and allowable2[j]:
- intersection += sim
- allowable1[i] = False
- allowable2[j] = False
-
- union = len(tokens1) + len(tokens2) - intersection
-
- if union > 0:
- return intersection / union
- elif intersection == 0:
- return 1.
- else:
- return np.inf
-
-if __name__ == '__main__':
- calculator = ShortSentenceWord2VecSimilarity(args.word2vec_modelpath)
- end = False
- # preload tokenizer
- tokenize('Mogu is cute.')
- while not end:
- sent1 = raw_input('sent1> ')
- if len(sent1)==0:
- end = True
- else:
- sent2 = raw_input('sent2> ')
- print "Word2Vec Jaccard Score Similarity = ", calculator.jaccardscore_sents(sent1, sent2)
\ No newline at end of file
diff --git a/bin/switch_kerasbackend b/bin/switch_kerasbackend
deleted file mode 100644
index 9372986c..00000000
--- a/bin/switch_kerasbackend
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env python
-
-# Secret code. Welcome for those who find this code.
-
-# argument parsing
-import argparse
-
-def getargparser():
- parser = argparse.ArgumentParser(description='Switch Keras backend')
- parser.add_argument('backend', help="Backend ('theano' or 'tensorflow')")
- return parser
-
-parser = getargparser()
-args = parser.parse_args()
-
-import os
-import json
-
-homedir = os.path.expanduser('~')
-kerasconfigfile = os.path.join(homedir, '.keras/keras.json')
-
-if __name__ == '__main__':
- kerasconfig = json.load(open(kerasconfigfile, 'r'))
- kerasconfig['backend'] = args.backend
- json.dump(kerasconfig, open(kerasconfigfile, 'w'))
- print 'Keras backend set to', args.backend
\ No newline at end of file
diff --git a/data/USInaugural.zip b/data/USInaugural.zip
deleted file mode 100644
index eb19e79f..00000000
Binary files a/data/USInaugural.zip and /dev/null differ
diff --git a/data/nih_full.csv.zip b/data/nih_full.csv.zip
deleted file mode 100644
index 96158d7c..00000000
Binary files a/data/nih_full.csv.zip and /dev/null differ
diff --git a/docs/codes.rst b/docs/codes.rst
index e57e5921..b86ecd42 100644
--- a/docs/codes.rst
+++ b/docs/codes.rst
@@ -1,87 +1,30 @@
-Documentation
-=============
+API
+===
-Training Data Retrieval
------------------------
+APIs not covered in the tutorials are listed here.
-Module `shorttext.data.data_retrieval`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. automodule:: shorttext.data.data_retrieval
- :members:
-
-Text Preprocessing
-------------------
-
-Module `shorttext.utils.textpreprocessing`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. automodule:: shorttext.utils.textpreprocessing
- :members:
-
-Topic Models
-------------
-
-Module `shorttext.generators.bow.LatentTopicModeling`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. automodule:: shorttext.generators.bow.LatentTopicModeling
- :members:
-
-Module `shorttext.generators.bow.GensimTopicModeling`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. automodule:: shorttext.generators.bow.GensimTopicModeling
- :members:
-
-Module `shorttext.generators.bow.AutoEncodingTopicModeling`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Shorttext Models Smart Loading
+------------------------------
-.. automodule:: shorttext.generators.bow.AutoEncodingTopicModeling
- :members:
-
-
-Module `shorttext.classifiers.topic.TopicVectorDistanceClassification`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. automodule:: shorttext.classifiers.bow.topic.TopicVectorDistanceClassification
- :members:
-
-Module `shorttext.classifiers.topic.SkLearnClassification`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. automodule:: shorttext.classifiers.bow.topic.SkLearnClassification
+.. automodule:: shorttext.smartload
:members:
Supervised Classification using Word Embedding
----------------------------------------------
-Module `shorttext.classifiers.embed.sumvec.SumEmbedVecClassification`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Module `shorttext.generators.seq2seq.s2skeras`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. automodule:: shorttext.classifiers.embed.sumvec.SumEmbedVecClassification
+.. automodule:: shorttext.generators.seq2seq.s2skeras
:members:
+
Module `shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. automodule:: shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification
:members:
-Module `shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. automodule:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification
- :members:
-
-Maximum Entropy Classifiers
----------------------------
-
-Module `shorttext.classifiers.bow.maxent.MaxEntClassification
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. automodule:: shorttext.classifiers.bow.maxent.MaxEntClassification
- :members:
Neural Networks
---------------
@@ -92,11 +35,6 @@ Module `shorttext.classifiers.embed.sumvec.frameworks`
.. automodule:: shorttext.classifiers.embed.sumvec.frameworks
:members:
-Module `shorttext.classifiers.embed.nnlib.frameworks`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. automodule:: shorttext.classifiers.embed.nnlib.frameworks
- :members:
Utilities
---------
@@ -113,25 +51,46 @@ Module `shorttext.utils.gensim_corpora`
.. automodule:: shorttext.utils.gensim_corpora
:members:
-Module `shorttext.utils.wordembed`
+Module `shorttext.utils.compactmodel_io`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. automodule:: shorttext.utils.compactmodel_io
+ :members:
+
+
+Metrics
+-------
+
+Module `shorttext.metrics.dynprog`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. automodule:: shorttext.utils.wordembed
+.. automodule:: shorttext.metrics.dynprog.jaccard
:members:
-Module `shorttext.utils.compactmodel_io`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. automodule:: shorttext.metrics.dynprog.dldist
+ :members:
-.. automodule:: shorttext.utils.compactmodel_io
+.. automodule:: shorttext.metrics.dynprog.lcp
:members:
-Stacked Generalization
-----------------------
+Module `shorttext.metrics.wasserstein`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. automodule:: shorttext.metrics.wasserstein.wordmoverdist
+ :members: word_mover_distance_linprog
-Module `shorttext.stack`
+Spell Correction
+----------------
+
+Module `shorttext.spell`
^^^^^^^^^^^^^^^^^^^^^^^^
-.. automodule:: shorttext.stack.stacking
+.. automodule:: shorttext.spell.basespellcorrector
:members:
+
+
+
+
+
Home: :doc:`index`
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index 744ae2ad..3676f518 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,19 +18,8 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.append(os.path.abspath('.'))
-sys.path.append(os.path.abspath('..'))
-sys.path.append(os.path.abspath('../shorttext'))
-sys.path.append(os.path.abspath('../shorttext/data'))
-sys.path.append(os.path.abspath('../shorttext/utils'))
-sys.path.append(os.path.abspath('../shorttext/classifiers'))
-sys.path.append(os.path.abspath('../shorttext/classifiers/embed'))
-sys.path.append(os.path.abspath('../shorttext/classifiers/embed/autoencode'))
-sys.path.append(os.path.abspath('../shorttext/classifiers/embed/sumvec'))
-sys.path.append(os.path.abspath('../shorttext/classifiers/embed/nnlib'))
-sys.path.append(os.path.abspath('../shorttext/classifiers/bow'))
-sys.path.append(os.path.abspath('../shorttext/classifiers/bow/topic'))
-sys.path.append(os.path.abspath('../bin'))
+sys.path.insert(0, os.path.abspath('..'))
+sys.path.insert(0, os.path.abspath('../src'))
# -- General configuration ------------------------------------------------
@@ -41,10 +30,10 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.mathjax'
+ 'sphinx.ext.mathjax', 'sphinx.ext.autodoc'
]
+
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
@@ -61,17 +50,17 @@
# General information about the project.
project = u'shorttext'
-copyright = u'2017, Kwan-Yuet Ho'
-author = u'Kwan-Yuet Ho'
+copyright = u'2017, Kwan Yuet Stephen Ho'
+author = u'Kwan Yuet Stephen Ho'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
-version = u'0.3'
+version = u'3.0'
# The full version, including alpha/beta/rc tags.
-release = u'0.3.8'
+release = u'3.0.1'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -99,7 +88,7 @@
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
-add_module_names = False
+#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
@@ -117,12 +106,19 @@
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
+# -- Options for Autodoc --------------------------------------------------
+
+# Mock imports for heavy dependencies
+autodoc_mock_imports = [
+ 'tensorflow', 'keras', 'gensim', 'numba', 'joblib'
+]
+
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-html_theme = 'alabaster'
+html_theme = 'classic'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
diff --git a/docs/faq.rst b/docs/faq.rst
index f8b20188..27274134 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -1,59 +1,55 @@
Frequently Asked Questions (FAQ)
================================
-1. Can we use Tensorflow backend?
+**Q1. Can we use backends other than TensorFlow?**
-Ans: Yes, users can use tensorflow backend instead of theano backend, as both as supported
-by Keras. Refer to `Keras Backend
-`_ for information about switching backends.
+Ans: No.
-2. Can we use word-embedding algorithms other than Word2Vec?
-Ans: Currently only Word2Vec is directly supported. However, you can
-convert GloVe models into Word2Vec models. See: :doc:`tutorial_wordembed` .
+**Q2. Can we use word-embedding algorithms other than Word2Vec?**
-3. Can this package work on Python 3?
+Ans: Yes. Besides Word2Vec, you can use FastText and Poincaré embedding. See: :doc:`tutorial_wordembed` .
-Ans: This package is written in Python 2.7. It is not guaranteed that the package works perfectly
-well in Python 3.
-4. This package requires SpaCy, which involves loading several models that
-are needed for `shorttext` to run correctly. It gives error whenever I ran
-models that require tokenization. What should I do?
+**Q3. Can this package work on Python 2?**
-If your code gives the error message that includes the following:
+Ans: No.
-::
- ValueError: Found English model at //anaconda/lib/python2.7/site-packages/spacy/data/en-1.1.0.
- This model is not compatible with the current version.
- See https://spacy.io/docs/usage/models to download the new model.
-Then run the following command in your terminal or console:
+**Q4. How should I cite `shorttext` if I use it in my research?**
-::
+Ans: For the time being, you do not have to cite a particular paper for using this package.
+However, if you use any particular function or class, check its docstring. If a paper (or papers)
+is mentioned there, cite those papers. For example, if you use `CNNWordEmbed` in `frameworks
+`_,
+then, according to the docstring, cite Yoon Kim's paper. Refer to this documentation for the reference as well.
- python -m spacy download en
-Refer to `spaCy webpage
-`_ for more information.
+**Q5. I am having trouble installing `shorttext` on Google Cloud Platform. What should I do?**
-5. Warning or messages pop up when running models involving neural networks. What is the problem?
+Ans: The header file "Python.h" is missing. Run `sudo apt-get install python3-dev` in the SSH shell of the VM instance.
-Make sure your `keras` have version >= 2.
-6. The following error message appears while loading shorttext:
-::
+**Q6. Where is the Sakaguchi spell corrector?**
- ImportError: dlopen: cannot load any more object with static TLS
+Ans: It was removed in release 3.0.0, but you can refer to the `examples/` folder in the
+Github repository for the code.
-How do I deal with it?
-If you use Tensorflow as your backend, you may experience this problem. This has been pointed
-out by Yeung in the community: `issue
-`_ . You should either reload tensorflow,
-or reinstall, or try to workaround by importing `spaCy` before `shorttext`.
+
+**Q7. Where are `WrappedBERTEncoder` and `BERTScorer`?**
+
+Ans: They were removed in release 3.0.0, but you can install the separate package `shorttext-bert`
+to get the same functionality.
+
+
+
+**Q8. My model files were created by `shorttext` version < 2.0.0. How do I make them readable for version >= 2.0.0?**
+
+Ans: Simply rename those files so that names ending with `.h5` end with `.weights.h5` instead.
+
Home: :doc:`index`
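FAQ Q8 above asks for old model weight files to be renamed from `.h5` to `.weights.h5`. A minimal rename sketch, assuming the weight files sit in the current directory and only the extension needs to change (the directory layout is an assumption, not prescribed by the FAQ):

```python
# Rename pre-2.0.0 weight files *.h5 -> *.weights.h5 (illustrative sketch).
from pathlib import Path

for old_path in Path(".").glob("*.h5"):
    if old_path.name.endswith(".weights.h5"):
        continue  # already converted
    new_path = old_path.with_name(old_path.stem + ".weights.h5")
    old_path.rename(new_path)
    print(f"renamed {old_path.name} -> {new_path.name}")
```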
diff --git a/docs/index.rst b/docs/index.rst
index 1d881df6..c623d70e 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -8,7 +8,9 @@ Homepage of `shorttext`
This repository is a collection of algorithms for multi-class classification to short texts using Python.
Modules are backward compatible unless otherwise specified. Feel free to give suggestions or report
-issues through the Issue_ tab of the Github_ page.
+issues through the Issue_ tab of the Github_ page. This is a PyPI_ project. This is an open-source
+project under the `MIT License
+`_ .
Contents:
@@ -18,8 +20,8 @@ Contents:
intro
install
tutorial
- codes
scripts
+ codes
faq
refs
links
@@ -27,6 +29,7 @@ Contents:
.. _Github: https://github.com/stephenhky/PyShortTextCategorization
.. _Issue: https://github.com/stephenhky/PyShortTextCategorization/issues
+.. _PyPI: https://pypi.org/project/shorttext/
Indices and tables
==================
diff --git a/docs/install.rst b/docs/install.rst
index d48ac600..855cbcf5 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -4,48 +4,66 @@ Installation
PIP
---
+Package `shorttext` runs in Python 3.9, 3.10, 3.11, and 3.12. However, for Python>=3.7, the backend
+of keras_ cannot be Tensorflow_.
+
To install the package in Linux or OS X, enter the following in the console:
::
- pip install -U shorttext
+ pip install shorttext
It is very possible that you have to do it as root, that you have to add ``sudo`` in
front of the command.
-However, the repository on Python Package Index is not always the most updated. To get
-the most updated (not official) version, you can install from Github_:
+On the other hand, to get the development version on Github, you can install from Github_:
::
- pip install -U git+https://github.com/stephenhky/PyShortTextCategorization
-
-By adding ``-U`` in the command, it automatically installs the required packages. If not,
-you have to install these packages on your own.
+ pip install git+https://github.com/stephenhky/PyShortTextCategorization@master
-.. _Github: https://github.com/stephenhky/PyShortTextCategorization
-Required Packages
+Backend for Keras
-----------------
-- Numpy_ (Numerical Python)
-- SciPy_ (Scientific Python)
-- Scikit-Learn_ (Machine Learning in Python)
-- Theano_ (Symbolic Computing for Deep Learning)
-- keras_ (Deep Learning Library for Theano and Tensorflow)
-- gensim_ (Topic Modeling for Humans)
-- Pandas_ (Python Data Analysis Library)
-- spaCy_ (Industrial Strenglth Natural Language Processing in Python)
-- stemming_ (stemming in Python)
+We use TensorFlow for `keras`.
+
+Possible Solutions for Installation Failures
+--------------------------------------------
+
+Most developers can install `shorttext` with the instructions above. If the installation fails,
+you may try one (or more) of the following:
+
+1. Installing the Python development headers by typing:
+
+
+::
+
+ sudo apt-get install python3-dev
+
+
+
+2. Installing `libc6` by entering
+
+::
+
+ sudo apt-get install libc6
+
+
+
+.. _Github: https://github.com/stephenhky/PyShortTextCategorization
+
Home: :doc:`index`
.. _Numpy: http://www.numpy.org/
.. _SciPy: https://www.scipy.org/
.. _Scikit-Learn: http://scikit-learn.org/stable/
+.. _Tensorflow: https://www.tensorflow.org/
.. _Theano: http://deeplearning.net/software/theano/
+.. _CNTK: https://github.com/Microsoft/CNTK/wiki
.. _keras: https://keras.io/
.. _gensim: https://radimrehurek.com/gensim/
.. _Pandas: http://pandas.pydata.org/
-.. _spaCy: https://spacy.io/
-.. _stemming: https://pypi.python.org/pypi/stemming/
+.. _snowballstemmer: https://github.com/snowballstem/snowball
+.. _Joblib: https://joblib.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/docs/intro.rst b/docs/intro.rst
index 1d3e5c72..078015af 100644
--- a/docs/intro.rst
+++ b/docs/intro.rst
@@ -1,13 +1,15 @@
Introduction
============
-This package `shorttext` is a Python package that facilitates supervised
+This package `shorttext` is a Python package that facilitates supervised and unsupervised
learning for short text categorization. Due to the sparseness of words and
the lack of information carried in the short texts themselves, an intermediate
representation of the texts and documents are needed before they are put into
any classification algorithm. In this package, it facilitates various types
of these representations, including topic modeling and word-embedding algorithms.
+The package `shorttext` runs on Python 3.9, 3.10, 3.11, and 3.12.
+
Characteristics:
- example data provided (including subject keywords and NIH RePORT); (see :doc:`tutorial_dataprep`)
@@ -15,10 +17,22 @@ Characteristics:
- pre-trained word-embedding support; (see :doc:`tutorial_wordembed`)
- `gensim` topic models (LDA, LSI, Random Projections) and autoencoder; (see :doc:`tutorial_topic`)
- topic model representation supported for supervised learning using `scikit-learn`; (see :doc:`tutorial_topic`)
-- cosine distance classification; (see :doc:`tutorial_topic`, :doc:`tutorial_umvec`) and
-- neural network classification (including ConvNet, and C-LSTM). (see :doc:`tutorial_nnlib`)
+- cosine distance classification; (see :doc:`tutorial_topic`, :doc:`tutorial_sumvec`)
+- neural network classification (including ConvNet, and C-LSTM); (see :doc:`tutorial_nnlib`)
+- maximum entropy classification; (see :doc:`tutorial_maxent`)
+- metrics of phrases differences, including soft Jaccard score (using Damerau-Levenshtein distance), and Word Mover's distance (WMD); (see :doc:`tutorial_metrics`)
+- character-level sequence-to-sequence (seq2seq) learning; (see :doc:`tutorial_charbaseseq2seq`)
+- spell correction; (see :doc:`tutorial_spell`)
+
+Author: Kwan Yuet Stephen Ho (LinkedIn_, ResearchGate_)
+Other contributors: `Chinmaya Pancholi `_, `Minseo Kim `_
+
+Contribution
+------------
-Author: Kwan-Yuet Ho (LinkedIn_, ResearchGate_)
+If you would like to contribute, feel free to submit pull requests to the `develop` branch.
+You can talk to me in advance through e-mail or the `Issues
+`_ page.
Home: :doc:`index`
diff --git a/docs/links.rst b/docs/links.rst
index ddb16b80..1608ea36 100644
--- a/docs/links.rst
+++ b/docs/links.rst
@@ -9,7 +9,14 @@ Project Codes and Package
.. _Github: https://github.com/stephenhky/PyShortTextCategorization
-.. _PyPI: https://pypi.python.org/pypi/shorttext
+.. _PyPI: https://pypi.org/project/shorttext/
+
+Issues
+------
+
+To report bugs and issues, please go to Issues_.
+
+.. _Issues: https://github.com/stephenhky/PyShortTextCategorization/issues
Gensim Incubator
----------------
@@ -19,10 +26,12 @@ by Google Summer of Code (GSoC) project to support the open-source project for `
Part of his project is to employ the wrapping ideas in `shorttext` to integrate `keras`,
`scikit-learn` and `gensim`.
-Chinmaya's blog post:
+Chinmaya's blog posts: `https://rare-technologies.com/author/chinmaya/
+`_
+
+Chinmaya's proposal for GSoC: `https://github.com/numfocus/gsoc/blob/master/2017/proposals/Chinmaya_Pancholi.md
+`_
-* `Google Summer of Code 2017 – Week 1 on Integrating Gensim with scikit-learn and Keras
- `_
Blog Entries
------------
@@ -44,5 +53,22 @@ Blog Entries
"Word-Embedding Algorithms," *Everything About Data Analytics*, WordPress (2016). [`WordPress
`_]
+"Python Package for Short Text Mining," *Everything About Data Analytics*, WordPress (2016). [`WordPress
+`_]
+
+"Short Text Mining using Advanced Keras Layers and Maxent: shorttext 0.4.1," *Everything About Data Analytics*, WordPress (2017). [`WordPress
+`_]
+
+"Word Mover’s Distance as a Linear Programming Problem," *Everything About Data Analytics*, WordPress (2017). [`WordPress
+`_]
+
+"Release of shorttext 0.5.4," *Everything About Data Analytics*, WordPress (2017). [`WordPress
+`_]
+
+"Document-Term Matrix: Text Mining in R and Python," *Everything About Data Analytics*, WordPress (2018). [`WordPress
+`_]
+
+"Package shorttext 1.0.0 Released," Medium (2018). [`Medium
+`_]
Home: :doc:`index`
\ No newline at end of file
diff --git a/docs/news.rst b/docs/news.rst
index bc83b6aa..5fd66644 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -1,6 +1,74 @@
News
====
+* 10/27/2025: `shorttext` 3.0.1 released.
+* 08/10/2025: `shorttext` 3.0.0 released.
+* 06/02/2025: `shorttext` 2.2.1 released.
+* 05/29/2025: `shorttext` 2.2.0 released.
+* 05/08/2025: `shorttext` 2.1.1 released.
+* 12/14/2024: `shorttext` 2.1.0 released.
+* 07/12/2024: `shorttext` 2.0.0 released.
+* 12/21/2023: `shorttext` 1.6.1 released.
+* 08/26/2023: `shorttext` 1.6.0 released.
+* 06/19/2023: `shorttext` 1.5.9 released.
+* 09/23/2022: `shorttext` 1.5.8 released.
+* 09/22/2022: `shorttext` 1.5.7 released.
+* 08/29/2022: `shorttext` 1.5.6 released.
+* 05/28/2022: `shorttext` 1.5.5 released.
+* 12/15/2021: `shorttext` 1.5.4 released.
+* 07/11/2021: `shorttext` 1.5.3 released.
+* 07/06/2021: `shorttext` 1.5.2 released.
+* 04/10/2021: `shorttext` 1.5.1 released.
+* 04/09/2021: `shorttext` 1.5.0 released.
+* 02/11/2021: `shorttext` 1.4.8 released.
+* 01/11/2021: `shorttext` 1.4.7 released.
+* 01/03/2021: `shorttext` 1.4.6 released.
+* 12/28/2020: `shorttext` 1.4.5 released.
+* 12/24/2020: `shorttext` 1.4.4 released.
+* 11/10/2020: `shorttext` 1.4.3 released.
+* 10/18/2020: `shorttext` 1.4.2 released.
+* 09/23/2020: `shorttext` 1.4.1 released.
+* 09/02/2020: `shorttext` 1.4.0 released.
+* 07/23/2020: `shorttext` 1.3.0 released.
+* 06/05/2020: `shorttext` 1.2.6 released.
+* 05/20/2020: `shorttext` 1.2.5 released.
+* 05/13/2020: `shorttext` 1.2.4 released.
+* 04/28/2020: `shorttext` 1.2.3 released.
+* 04/07/2020: `shorttext` 1.2.2 released.
+* 03/23/2020: `shorttext` 1.2.1 released.
+* 03/21/2020: `shorttext` 1.2.0 released.
+* 12/01/2019: `shorttext` 1.1.6 released.
+* 09/24/2019: `shorttext` 1.1.5 released.
+* 07/20/2019: `shorttext` 1.1.4 released.
+* 07/07/2019: `shorttext` 1.1.3 released.
+* 06/05/2019: `shorttext` 1.1.2 released.
+* 04/23/2019: `shorttext` 1.1.1 released.
+* 03/03/2019: `shorttext` 1.1.0 released.
+* 02/14/2019: `shorttext` 1.0.8 released.
+* 01/30/2019: `shorttext` 1.0.7 released.
+* 01/29/2019: `shorttext` 1.0.6 released.
+* 01/13/2019: `shorttext` 1.0.5 released.
+* 10/03/2018: `shorttext` 1.0.4 released.
+* 08/06/2018: `shorttext` 1.0.3 released.
+* 07/24/2018: `shorttext` 1.0.2 released.
+* 07/17/2018: `shorttext` 1.0.1 released.
+* 07/14/2018: `shorttext` 1.0.0 released.
+* 06/18/2018: `shorttext` 0.7.2 released.
+* 05/30/2018: `shorttext` 0.7.1 released.
+* 05/17/2018: `shorttext` 0.7.0 released.
+* 02/27/2018: `shorttext` 0.6.0 released.
+* 01/19/2018: `shorttext` 0.5.11 released.
+* 01/15/2018: `shorttext` 0.5.10 released.
+* 12/14/2017: `shorttext` 0.5.9 released.
+* 11/08/2017: `shorttext` 0.5.8 released.
+* 10/27/2017: `shorttext` 0.5.7 released.
+* 10/17/2017: `shorttext` 0.5.6 released.
+* 09/28/2017: `shorttext` 0.5.5 released.
+* 09/08/2017: `shorttext` 0.5.4 released.
+* 09/02/2017: end of GSoC project.
+* 08/22/2017: `shorttext` 0.5.1 released.
+* 07/28/2017: `shorttext` 0.4.1 released.
+* 07/26/2017: `shorttext` 0.4.0 released.
* 06/16/2017: `shorttext` 0.3.8 released.
* 06/12/2017: `shorttext` 0.3.7 released.
* 06/02/2017: `shorttext` 0.3.6 released.
@@ -17,14 +85,429 @@ News
* 11/21/2016: `shorttext` 0.1.1 released.
What's New
-==========
+----------
+
+Release 3.0.1 (October 25, 2025)
+--------------------------------
+
+* Small bugs fixed.
+
+Release 3.0.0 (August 10, 2025)
+-------------------------------
+
+* Introduction of Github workflow, publishing package directly to PyPI from Github;
+* Removal of the Sakaguchi spell corrector; (refer to the `examples/` folder in the repository)
+* Removal of `WrappedBERTEncoder` and `BERTScorer`; (they can be installed from the package `shorttext-bert
+`_)
+* Update of documentation.
+
+
+Release 2.2.1 (June 2, 2025)
+----------------------------
+
+* Code cleanup for token categorization. (Acknowledgements: Minseo Kim)
+
+
+Release 2.2.0 (May 29, 2025)
+----------------------------
+
+* Update `keras` to `tensorflow.keras`. (Acknowledgements: Minseo Kim)
+
+
+Release 2.1.1 (May 8, 2025)
+---------------------------
+
+* Update of Snowball stemmer;
+* Codes cleaned up.
+
+Release 2.1.0 (December 14, 2024)
+---------------------------------
+
+* Use of `pyproject.toml` for package distribution.
+* Removed Cython components.
+* Huge relative import refactoring.
+
+Release 2.0.0 (July 13, 2024)
+-----------------------------
+
+* Decommissioned support for Python 3.8.
+* Added support for Python 3.12.
+* Updated file extensions for model files.
+
+Release 1.6.1 (December 21, 2023)
+---------------------------------
+
+* Updated package requirements.
+
+Release 1.6.0 (August 26, 2023)
+-------------------------------
+
+* Pinned requirements for ReadTheDocs documentation;
+* Fixed bugs in word-embedding model mean pooling classifiers;
+* Updated package requirements.
+
+
+Release 1.5.9 (June 19, 2023)
+-----------------------------
+
+* Support for Python 3.11;
+* Removing flask.
+
+Release 1.5.8 (September 23, 2022)
+----------------------------------
+
+* Package administration.
+
+Release 1.5.7 (September 22, 2022)
+----------------------------------
+
+* Removal of requirement of pre-installation of `numpy` and `Cython`.
+
+Release 1.5.6 (August 29, 2022)
+-------------------------------
+
+* Speeding up inference of `VarNNEmbeddedVecClassifier`. (Acknowledgement: Ritesh Agrawal)
+
+Release 1.5.5 (May 28, 2022)
+-----------------------------
+
+* Support for Python 3.10.
+
+
+Release 1.5.4 (December 15, 2021)
+---------------------------------
+
+* Non-negative stop words.
+
+Release 1.5.3 (July 11, 2021)
+-----------------------------
+
+* Documentation updated.
+
+Release 1.5.2 (July 6, 2021)
+----------------------------
+
+* Resolved bugs regarding `keras` import.
+* Support for Python 3.9.
+
+Release 1.5.1 (April 10, 2021)
+------------------------------
+
+* Replaced TravisCI with CircleCI in the continuous integration pipeline.
+
+Release 1.5.0 (April 09, 2021)
+------------------------------
+
+* Removed support for Python 3.6.
+* Removed buggy BERT representations unit test.
+
+Release 1.4.8 (February 11, 2021)
+---------------------------------
+
+* Updated requirements for `scipy` for Python 3.7 or above.
+
+Release 1.4.7 (January 11, 2021)
+--------------------------------
+
+* Updated version of `transformers` in `requirement.txt`;
+* Updated BERT encoder for the change of implementation;
+* Fixed unit tests.
+
+Release 1.4.6 (January 3, 2021)
+-------------------------------
+
+* Bug regarding Python 3.6 requirement for `scipy`.
+
+Release 1.4.5 (December 28, 2020)
+---------------------------------
+
+* Fixed bugs from the Python 2 to 3 migration (`filter` in `shorttext.metrics.embedfuzzy`).
+
+Release 1.4.4 (December 24, 2020)
+---------------------------------
+
+* Bugs fixed regarding `SumEmbedVecClassification.py`;
+* Fixing bugs due to Python 3.6 restriction on `scipy`.
+
+
+Release 1.4.3 (November 10, 2020)
+---------------------------------
+
+* Bugs about transformer-based model on different devices resolved.
+
+Release 1.4.2 (October 18, 2020)
+----------------------------------
+
+* Documentation requirements and PyUp configs cleaned up.
+
+Release 1.4.1 (September 23, 2020)
+----------------------------------
+
+* Documentation and codes cleaned up.
+
+Release 1.4.0 (September 2, 2020)
+---------------------------------
+
+* Provided support for BERT-based sentence and token embeddings;
+* Implemented support for BERTScores.
+
+Release 1.3.0 (July 23, 2020)
+-----------------------------
+
+* Removed all dependencies on `PuLP`; all computations of Word Mover's distance (WMD) are performed using `SciPy`.
+
+Release 1.2.6 (June 20, 2020)
+-----------------------------
+
+* Removed Python-2 codes (`urllib2`).
+
+Release 1.2.5 (May 20, 2020)
+----------------------------
+
+* Update on `gensim` package usage and requirements;
+* Removed some deprecated functions.
+
+Release 1.2.4 (May 13, 2020)
+----------------------------
+
+* Updated `scikit-learn` requirement to `>=0.23.0`;
+* Direct dependence on `joblib`;
+* Support for Python 3.8 added.
+
+Release 1.2.3 (April 28, 2020)
+------------------------------
+
+* PyUP scan implemented;
+* Support for Python 3.5 decommissioned.
+
+Release 1.2.2 (April 7, 2020)
+-----------------------------
+
+* Removed dependence on `PyStemmer`, which is replaced by `snowballstemmer`.
+
+Release 1.2.1 (March 23, 2020)
+------------------------------
+
+* Added port number adjustability for word-embedding API;
+* Removal of Spacy dependency.
+
+Release 1.2.0 (March 21, 2020)
+------------------------------
+
+* API for word-embedding algorithm for one-time loading.
+
+
+Release 1.1.6 (December 1, 2019)
+--------------------------------
+
+* Compatibility with TensorFlow 2.0.0.
+
+
+Release 1.1.5 (September 24, 2019)
+----------------------------------
+
+* Decommissioned GCP buckets; using data files stored in AWS S3 buckets.
+
+
+Release 1.1.4 (July 20, 2019)
+-----------------------------
+
+* Minor bugs fixed.
+
+Release 1.1.3 (July 7, 2019)
+----------------------------
+
+* Updated codes for Console code loading;
+* Updated Travis CI script.
+
+Release 1.1.2 (June 5, 2019)
+-----------------------------
+
+* Updated codes for FastText model loading, as the previous function was deprecated.
+
+Release 1.1.1 (April 23, 2019)
+------------------------------
+
+* Bug fixed. (Acknowledgement: `Hamish Dickson
+ `_ )
+
+Release 1.1.0 (March 3, 2019)
+-----------------------------
+
+* Size of embedded vectors set to 300 again when necessary (possibly breaking compatibility);
+* Moving corpus data from Github to Google Cloud Storage.
+
+
+Release 1.0.8 (February 14, 2019)
+---------------------------------
+
+* Minor bugs fixed.
+
+
+Release 1.0.7 (January 30, 2019)
+--------------------------------
+
+* Compatibility with Python 3.7 with TensorFlow as the backend.
+
+Release 1.0.7 (January 30, 2019)
+--------------------------------
+
+* Compatibility with Python 3.7 with Theano as the backend;
+* Minor documentation changes.
+
+
+Release 1.0.6 (January 29, 2019)
+--------------------------------
+
+* Documentation change;
+* Word-embedding model used in unit test stored in Amazon S3 bucket.
+
+
+Release 1.0.5 (January 13, 2019)
+--------------------------------
+
+* Minor versioning bug fixed.
+
+
+Release 1.0.4 (October 3, 2018)
+-------------------------------
+
+* Package `keras` requirement updated;
+* Less dependence on `pandas`.
+
+
+Release 1.0.3 (August 6, 2018)
+------------------------------
+
+* Bugs regarding I/O of `SumEmbeddedVecClassifier`.
+
+Release 1.0.2 (July 24, 2018)
+-----------------------------
+
+* Minor bugs regarding installation fixed.
+
+Release 1.0.1 (July 14, 2018)
+-----------------------------
+
+* Minor bugs fixed.
+
+Release 1.0.0 (July 14, 2018)
+-----------------------------
+
+* Python-3 compatibility;
+* Replacing the original stemmer with the Snowball stemmer;
+* Certain functions cythonized;
+* Various bugs fixed.
+
+Release 0.7.2 (June 18, 2018)
+-----------------------------
+
+* Damerau-Levenshtein distance and longest common prefix implemented using Cython.
+
+Release 0.7.1 (May 30, 2018)
+----------------------------
+
+* Decorator replaced by base class `CompactIOMachine`;
+* API included in documentation.
+
+
+Release 0.7.0 (May 17, 2018)
+----------------------------
+
+* Spelling corrections and fuzzy logic;
+* More unit tests.
+
+
+Release 0.6.0 (February 27, 2018)
+---------------------------------
+
+* Support of character-based sequence-to-sequence (seq2seq) models.
+
+
+Release 0.5.11 (January 19, 2018)
+---------------------------------
+
+* Removal of word-embedding `keras`-type layers.
+
+Release 0.5.10 (January 15, 2018)
+---------------------------------
+
+* Support of encoder module for character-based models;
+* Implementation of document-term matrix (DTM).
+
+Release 0.5.9 (December 14, 2017)
+---------------------------------
+
+* Support of Poincare embedding;
+* Code optimization;
+* Script `ShortTextWord2VecSimilarity` updated to `ShortTextWordEmbedSimilarity`.
+
+Release 0.5.8 (November 8, 2017)
+--------------------------------
+
+* Removed most explicit user-specification of `vecsize` for given word-embedding models;
+* Removed old namespace for topic models (no more backward compatibility).
+* Integration of `FastText <https://github.com/facebookresearch/fastText>`_.
+
+
+Release 0.5.7 (October 27, 2017)
+--------------------------------
+
+* Removed most explicit user-specification of `vecsize` for given word-embedding models;
+* Removed old namespace for topic models (hence no more backward compatibility).
+
+Release 0.5.6 (October 17, 2017)
+--------------------------------
+
+* Updated the neural network framework due to the change in `gensim` API.
+
+Release 0.5.5 (September 28, 2017)
+----------------------------------
+
+* Script `ShortTextCategorizerConsole` updated.
+
+Release 0.5.4 (September 8, 2017)
+---------------------------------
+
+* Bug fixed;
+* New scripts for finding distances between sentences;
+* Finding similarity between two sentences using Jaccard index.
+
+End of GSoC Program (September 2, 2017)
+---------------------------------------
+
+Chinmaya summarized his GSoC program in his blog post posted in `RaRe Incubator
+`_.
+
+
+Release 0.5.1 (August 22, 2017)
+-------------------------------
+
+* Implementation of Damerau-Levenshtein distance and soft Jaccard score;
+* Implementation of Word Mover's distance.
+
+
+Release 0.4.1 (July 28, 2017)
+-----------------------------
+
+* Further Travis CI update tests;
+* Model file I/O updated (for huge models);
+* Migrating documentation to `readthedocs.org <https://readthedocs.org>`_; previous documentation at `Pythonhosted.org` destroyed.
+
+
+Release 0.4.0 (July 26, 2017)
+-----------------------------
+
+* Maximum entropy models;
+* Use of `gensim` Word2Vec `keras` layers;
+* Incorporating new features from `gensim`;
+* Use of Travis CI for pull request testing.
Release 0.3.8 (June 16, 2017)
-----------------------------
* Bug fixed on `sumvecframeworks`.
-
Release 0.3.7 (June 12, 2017)
-----------------------------
@@ -38,7 +521,7 @@ Release 0.3.6 (June 2, 2017)
* Added "update" corpus capability to `gensim` models.
Google Summer of Code (May 30, 2017)
------------------------------------------
+------------------------------------
Chinamaya Pancholi, a Google Summer of Code (GSoC) student, is involved in
the open-source development of `gensim`, that his project will be very related
diff --git a/docs/refs.rst b/docs/refs.rst
index 8ccc05da..75b7de5a 100644
--- a/docs/refs.rst
+++ b/docs/refs.rst
@@ -1,7 +1,17 @@
References
==========
-Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996).
+Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). [`ACM
+`_]
+
+Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly
+`_]
+
+Chinmaya Pancholi, "Gensim integration with scikit-learn and Keras," *Google Summer of Codes* (GSoC) proposal (2017). [`Github
+`_]
+
+Chinmaya Pancholi, "Chinmaya’s GSoC 2017 Summary: Integration with sklearn & Keras and implementing fastText," *RaRe Incubator* (September 2, 2017). [`RaRe
+`_]
Christopher Manning, Hinrich Schütze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press
`_]
@@ -12,28 +22,73 @@ Christopher D. Manning, Prabhakar Raghavan, Hinrich Schütze, *Introduction to I
Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, "A C-LSTM Neural Network for Text Classification," (arXiv:1511.08630). [`arXiv
`_]
+Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE
+`_]
+
+Daniel E. Russ, Kwan-Yuet Ho, Joanne S. Colt, Karla R. Armenti, Dalsu Baris, Wong-Ho Chow, Faith Davis, Alison Johnson, Mark P. Purdue, Margaret R. Karagas, Kendra Schwartz, Molly Schwenn, Debra T. Silverman, Patricia A. Stewart, Calvin A. Johnson, Melissa C. Friesen, “Computer-based coding of free-text job descriptions to efficiently and reliably incorporate occupational risk factors into large-scale epidemiological studies”, *Occup. Environ. Med.* 73, 417-424 (2016). [`BMJ
+`_]
+
+Daniel Russ, Kwan-yuet Ho, Melissa Friesen, "It Takes a Village To Solve A Problem in Data Science," Data Science Maryland, presentation at Applied Physics Laboratory (APL), Johns Hopkins University, on June 19, 2017. (2017) [`Slideshare
+`_]
+
David H. Wolpert, "Stacked Generalization," *Neural Netw* 5: 241-259 (1992).
-David M. Blei, "Probabilistic Topic Models," *Communications of the ACM* 55(4): 77-84 (2012).
+David M. Blei, "Probabilistic Topic Models," *Communications of the ACM* 55(4): 77-84 (2012). [`ACM
+`_]
+
+Francois Chollet, "A ten-minute introduction to sequence-to-sequence learning in Keras," *The Keras Blog*. [`Keras
+`_]
Francois Chollet, "Building Autoencoders in Keras," *The Keras Blog*. [`Keras
`_]
-Hsiang-Fu Yu, Chia-Hua Ho, Yu-Chin Juan, and Chih-Jen Lin, "LibShortText: A Library for Short-text Classification." [`NTU
+Hsiang-Fu Yu, Chia-Hua Ho, Yu-Chin Juan, Chih-Jen Lin, "LibShortText: A Library for Short-text Classification." [`NTU
`_]
+Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto
+`_]
+
+Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). [`arXiv
+`_]
+
+Jayant Jain, "Implementing Poincaré Embeddings," RaRe Technologies (2017). [`RaRe
+`_]
+
Jeffrey Pennington, Richard Socher, Christopher D. Manning, “GloVe: Global Vectors for Word Representation,” *Empirical Methods in Natural Language Processing (EMNLP)*, pp. 1532-1543 (2014). [`PDF
`_]
+Keisuke Sakaguchi, Kevin Duh, Matt Post, Benjamin Van Durme, "Robsut Wrod Reocginiton via semi-Character Recurrent Neural Networ," arXiv:1608.02214 (2016). [`arXiv
+`_]
+
"Keras 2.0 Release Notes." (2017) [`Github
`_]
+Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).
+
+Maximilian Nickel, Douwe Kiela, "Poincaré Embeddings for Learning Hierarchical Representations," arXiv:1705.08039 (2017). [`arXiv
+`_]
+
Michael Czerny, "Modern Methods for Sentiment Analysis," *District Data Labs (2015). [`DistrictDataLabs
`_]
M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization,"
*WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015).
+Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom, "A Convolutional Neural Network for Modelling Sentences," *Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics*, pp. 655-665 (2014). [`arXiv
+`_]
+
+Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv
+`_]
+
+Peter Norvig, "How to write a spell corrector." (2016) [`Norvig
+`_]
+
+Piotr Bojanowski, Edouard Grave, Armand Joulin, Tomas Mikolov, "Enriching Word Vectors with Subword Information," arXiv:1607.04606 (2016). [`arXiv
+`_]
+
+Radim Rehurek, Petr Sojka, "Software Framework for Topic Modelling with Large Corpora," In Proceedings of LREC 2010 workshop New Challenges for NLP Frameworks (2010). [`ResearchGate
+`_]
+
Sebastian Ruder, "An overview of gradient descent optimization algorithms," blog of Sebastian Ruder, arXiv:1609.04747 (2016). [`Ruder
`_ or `arXiv
`_]
@@ -48,6 +103,9 @@ Thomas W. Jones, "textmineR: Functions for Text Mining and Topic Modeling," CRAN
Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean, “Efficient Estimation of Word Representations in Vector Space,” *ICLR* 2013 (2013). [`arXiv
`_]
+Tom Young, Devamanyu Hazarika, Soujanya Poria, Erik Cambria, "Recent Trends in Deep Learning Based Natural Language Processing," arXiv:1708.02709 (2017). [`arXiv
+`_]
+
Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha,
"A Hidden Topic-Based Framework toward Building Applications with Short Web Documents,"
*IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011).
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 00000000..165c1f4f
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,14 @@
+numpy==2.4.0
+scipy==1.16.3
+joblib==1.5.3
+scikit-learn==1.8.0
+tensorflow==2.20.0
+keras==3.13.0
+gensim==4.4.0
+pandas==2.3.3
+snowballstemmer==3.0.1
+transformers==4.57.3
+torch==2.9.1
+numba==0.63.1
+npdict==0.0.5
+nptyping==2.5.0
diff --git a/docs/requirements_minimal.txt b/docs/requirements_minimal.txt
new file mode 100644
index 00000000..31c012cf
--- /dev/null
+++ b/docs/requirements_minimal.txt
@@ -0,0 +1,5 @@
+numpy>=1.23.3
+scipy>=1.12.0
+snowballstemmer>=3.0.0
+scikit-learn>=1.2.0
+pandas>=1.2.0
\ No newline at end of file
diff --git a/docs/scripts.rst b/docs/scripts.rst
index c70c9356..a729b6ed 100644
--- a/docs/scripts.rst
+++ b/docs/scripts.rst
@@ -10,33 +10,46 @@ ShortTextCategorizerConsole
::
- usage: ShortTextCategorizerConsole [-h] [--wv WV] [--topn TOPN] model_filepath
+ usage: ShortTextCategorizerConsole [-h] [--wv WV] [--vecsize VECSIZE]
+ [--topn TOPN] [--inputtext INPUTTEXT]
+ [--type TYPE]
+ model_filepath
Perform prediction on short text with a given trained model.
positional arguments:
- model_filepath Path of the trained (compact) model.
+ model_filepath Path of the trained (compact) model.
- optional arguments:
- -h, --help show this help message and exit
- --wv WV Path of the pre-trained Word2Vec model. (None if not needed)
- --topn TOPN Number of top-scored results displayed. (Default: 10)
+ options:
+ -h, --help show this help message and exit
+ --wv WV Path of the pre-trained Word2Vec model. (None if not
+ needed)
+ --vecsize VECSIZE Vector dimensions. (Default: 300)
+ --topn TOPN Number of top-scored results displayed. (Default: 10)
+ --inputtext INPUTTEXT
+ single input text for classification. Run console if
+ set to None. (Default: None)
+ --type TYPE Type of word-embedding model (default: "word2vec";
+ other options: "fasttext", "poincare",
+ "word2vec_nonbinary", "poincare_binary")
-ShortTextWord2VecSimilarity
----------------------------
+ShortTextWordEmbedSimilarity
+----------------------------
::
- usage: ShortTextWord2VecSimilarity [-h] word2vec_modelpath
+ usage: ShortTextWordEmbedSimilarity [-h] [--type TYPE] modelpath
- Find the similarity between two short sentences using Word2Vec.
+ Find the similarities between two short sentences using Word2Vec.
positional arguments:
- word2vec_modelpath Path of the Word2Vec model
+ modelpath Path of the Word2Vec model
optional arguments:
- -h, --help show this help message and exit
+ -h, --help show this help message and exit
+ --type TYPE Type of word-embedding model (default: "word2vec"; other
+ options: "fasttext", "poincare")
Home: :doc:`index`
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index e74c13f1..16ff691b 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -8,20 +8,22 @@ Before using, type
>>> import shorttext
-You will get the message that `Theano` or `Tensorflow` backend is used for `keras`. Refer to `Keras Backend
-`_ for information about switching backends.
-
.. toctree::
:maxdepth: 2
tutorial_dataprep
tutorial_textpreprocessing
+ tutorial_dtm
+ tutorial_charbaseonehot
tutorial_topic
tutorial_wordembed
tutorial_sumvec
tutorial_nnlib
tutorial_maxent
+ tutorial_charbaseseq2seq
tutorial_stacking
+ tutorial_metrics
+ tutorial_spell
Home: :doc:`index`
diff --git a/docs/tutorial_charbaseonehot.rst b/docs/tutorial_charbaseonehot.rst
new file mode 100644
index 00000000..bd458756
--- /dev/null
+++ b/docs/tutorial_charbaseonehot.rst
@@ -0,0 +1,58 @@
+Character to One-Hot Vector
+===========================
+
+Since version 0.6.1, the package `shorttext` deals with character-based models. A first important
+component of a character-based model is to convert every character to a one-hot vector. We provide the class
+:class:`shorttext.generators.SentenceToCharVecEncoder` to deal with this. This class incorporates
+the `OneHotEncoder` in `scikit-learn` and the `Dictionary` in `gensim`.
+
+To use this, import the packages first:
+
+>>> import numpy as np
+>>> import shorttext
+
+Then we incorporate a text file as the source of all characters to be coded. In this case, we choose
+the file `big.txt` on Peter Norvig's website:
+
+>>> from urllib.request import urlopen
+>>> textfile = urlopen('http://norvig.com/big.txt')
+
+Then instantiate the class using the function :func:`shorttext.generators.initSentenceToCharVecEncoder`:
+
+>>> chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(textfile)
+
+Now, the object `chartovec_encoder` is an instance of :class:`shorttext.generators.SentenceToCharVecEncoder` . The
+default signal character is `\n`, which is also encoded, and can be checked by looking at the field:
+
+>>> chartovec_encoder.signalchar
+
+We can convert a sentence into a bunch of one-hot vectors in terms of a matrix. For example,
+
+>>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100)
+<1x93 sparse matrix of type ''
+ with 1 stored elements in Compressed Sparse Column format>
+
+This outputs a sparse matrix. Depending on your needs, you can add the signal character to the beginning
+or the end of the sentence in the output matrix by:
+
+>>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100, startsig=True, endsig=False)
+>>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100, startsig=False, endsig=True)
+
+We can also convert a list of sentences by
+
+>>> chartovec_encoder.encode_sentences(sentences, 100, startsig=False, endsig=True, sparse=False)
+
+You can decide whether or not to output a sparse matrix by specifying the parameter `sparse`.
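+
+Here `sentences` is simply a list of strings. A minimal sketch, with a hypothetical pair of sentences
+(and assuming the dense output is a single array, so that its shape can be inspected):
+
+>>> sentences = ['Maryland blue crab!', 'Old Bay seasoning.']
+>>> encoded = chartovec_encoder.encode_sentences(sentences, 100, startsig=False, endsig=True, sparse=False)
+>>> encoded.shape   # expected: (number of sentences, maxlen, number of encoded characters)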
+
+
+.. automodule:: shorttext.generators.charbase.char2vec
+ :members:
+
+
+Reference
+---------
+
+Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly
+`_]
+
+Home: :doc:`index`
\ No newline at end of file
diff --git a/docs/tutorial_charbaseseq2seq.rst b/docs/tutorial_charbaseseq2seq.rst
new file mode 100644
index 00000000..4b7d085e
--- /dev/null
+++ b/docs/tutorial_charbaseseq2seq.rst
@@ -0,0 +1,86 @@
+Character-Based Sequence-to-Sequence (seq2seq) Models
+=====================================================
+
+Since release 0.6.0, `shorttext` supports sequence-to-sequence (seq2seq) learning. While there is a general seq2seq class
+behind it, the package provides a character-based seq2seq implementation.
+
+Creating One-hot Vectors
+------------------------
+
+To use it, create an instance of the class :class:`shorttext.generators.SentenceToCharVecEncoder`:
+
+>>> import numpy as np
+>>> import shorttext
+>>> from urllib.request import urlopen
+>>> chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(urlopen('http://norvig.com/big.txt'))
+
+The above code is the same as :doc:`tutorial_charbaseonehot` .
+
+.. automodule:: shorttext.generators.charbase.char2vec
+ :members: initSentenceToCharVecEncoder
+
+
+Training
+--------
+
+Then we can train the model by creating an instance of :class:`shorttext.generators.CharBasedSeq2SeqGenerator`:
+
+>>> latent_dim = 100
+>>> seq2seqer = shorttext.generators.CharBasedSeq2SeqGenerator(chartovec_encoder, latent_dim, 120)
+
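+The training corpus `text` used below is a plain string. As a hypothetical sketch (assuming the trainer
+accepts a raw string), it can simply be the Norvig text downloaded above:
+
+>>> text = urlopen('http://norvig.com/big.txt').read().decode('utf-8', errors='ignore')
+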
+And then train this neural network model:
+
+>>> seq2seqer.train(text, epochs=100)
+
+This model takes several hours to train on a laptop.
+
+
+.. autoclass:: shorttext.generators.seq2seq.charbaseS2S.CharBasedSeq2SeqGenerator
+ :members:
+
+Decoding
+--------
+
+After training, we can use this class as a generative model
+for answering questions, like a chatbot:
+
+>>> seq2seqer.decode('Happy Holiday!')
+
+It does not give definite answers because there is stochasticity in the prediction.
+
+Model I/O
+---------
+
+This model can be saved by entering:
+
+>>> seq2seqer.save_compact_model('/path/to/norvigtxt_iter5model.bin')
+
+And can be loaded by:
+
+>>> seq2seqer2 = shorttext.generators.seq2seq.charbaseS2S.loadCharBasedSeq2SeqGenerator('/path/to/norvigtxt_iter5model.bin')
+
+.. automodule:: shorttext.generators.seq2seq.charbaseS2S
+ :members: loadCharBasedSeq2SeqGenerator
+
+
+Reference
+---------
+
+Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly
+`_]
+
+Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto
+`_]
+
+Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). [`arXiv
+`_]
+
+Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv
+`_]
+
+Tom Young, Devamanyu Hazarika, Soujanya Poria, Erik Cambria, "Recent Trends in Deep Learning Based Natural Language Processing," arXiv:1708.02709 (2017). [`arXiv
+`_]
+
+Zackary C. Lipton, John Berkowitz, "A Critical Review of Recurrent Neural Networks for Sequence Learning," arXiv:1506.00019 (2015). [`arXiv
+`_]
+
diff --git a/docs/tutorial_dataprep.rst b/docs/tutorial_dataprep.rst
index 25080bf6..9d7f9d4e 100644
--- a/docs/tutorial_dataprep.rst
+++ b/docs/tutorial_dataprep.rst
@@ -32,6 +32,10 @@ the subject keywords, as below:
'Holy Trinity', 'eschatology', 'scripture', 'ecclesiology', 'predestination',
'divine degree', 'creedal confessionalism', 'scholasticism', 'prayer', 'eucharist']}
+
+.. automodule:: shorttext.data.data_retrieval
+ :members: subjectkeywords
+
Example Training Data 2: NIH RePORT
-----------------------------------
@@ -55,9 +59,9 @@ randomly drawn from the original data.
However, there are other configurations:
-.. autofunction:: shorttext.data.nihreports
+.. automodule:: shorttext.data.data_retrieval
+ :members: nihreports
-If `sample_size` is specified to be `None`, all the data will be retrieved without sampling.
Example Training Data 3: Inaugural Addresses
--------------------------------------------
@@ -73,7 +77,8 @@ Enter:
>>> trainclassdict = shorttext.data.inaugural()
-.. autfunction:: shorttext.data.inaugural
+.. automodule:: shorttext.data.data_retrieval
+ :members: inaugural
User-Provided Training Data
@@ -110,4 +115,8 @@ To load this data file, just enter:
>>> trainclassdict = shorttext.data.retrieve_csvdata_as_dict('/path/to/file.csv')
+.. automodule:: shorttext.data.data_retrieval
+ :members: retrieve_csvdata_as_dict
+
+
Home: :doc:`index`
diff --git a/docs/tutorial_dtm.rst b/docs/tutorial_dtm.rst
new file mode 100644
index 00000000..005e625a
--- /dev/null
+++ b/docs/tutorial_dtm.rst
@@ -0,0 +1,82 @@
+Document-Term Matrix
+====================
+
+Preparing for the Corpus
+------------------------
+
+We can create and handle a document-term matrix (DTM) with `shorttext`. We use the dataset of the Presidents'
+Inaugural Addresses as an example.
+
+>>> import shorttext
+>>> usprez = shorttext.data.inaugural()
+
+We have to make each president's address one document for our purpose. Enter this:
+
+>>> docids = sorted(usprez.keys())
+>>> usprez = [' '.join(usprez[docid]) for docid in docids]
+
+Now the variable `usprez` is a list of 56 Inaugural Addresses from George Washington (1789) to
+Barack Obama (2009), with the IDs stored in `docids`. We apply the standard text preprocessor and
+produce a list of lists (of tokens) (or a corpus in `gensim`):
+
+>>> preprocess = shorttext.utils.standard_text_preprocessor_1()
+>>> corpus = [preprocess(address).split(' ') for address in usprez]
+
+Now the variable `corpus` is a list of lists of tokens. For example,
+
+>>> corpus[0] # shows all the preprocessed tokens of the first Presidential Inaugural Address
+
+Using Class `DocumentTermMatrix`
+--------------------------------
+
+With the corpus ready in this form, we can create a `DocumentTermMatrix` class for DTM by:
+
+>>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids)
+
+.. autoclass:: shorttext.utils.dtm.DocumentTermMatrix
+ :members:
+
+One can get the document frequency of any token (the number of documents that the given
+token is in) by:
+
+>>> usprez_dtm.get_doc_frequency('peopl') # gives 54, the document frequency of the token "peopl"
+
+or the total term frequencies (the total number of occurrences of the given tokens in all documents) by:
+
+>>> usprez_dtm.get_total_termfreq('justic') # gives 134.0, the total term frequency of the token "justic"
+
+or the term frequency for a token in a given document by:
+
+>>> usprez_dtm.get_termfreq('2009-Obama', 'chang') # gives 2.0
+
+We can also query the number of occurrences of a particular word across all documents,
+returned as a dictionary, by:
+
+>>> usprez_dtm.get_token_occurences('god')
+
+Of course, we can always reweight the counts above (except the document frequency) with
+tf-idf by setting the parameter `tfidf` to `True` while creating the instance of the class:
+
+>>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True)
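+
+With `tfidf` turned on, the count-based queries above (everything except the document frequency) return
+tf-idf weights instead of raw counts, for example:
+
+>>> usprez_dtm.get_termfreq('2009-Obama', 'chang')   # now a tf-idf weight rather than the raw count of 2.0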
+
+To save the class, enter:
+
+>>> usprez_dtm.save_compact_model('/path/to/whatever.bin')
+
+To load this class later, enter:
+
+>>> usprez_dtm2 = shorttext.utils.load_DocumentTermMatrix('/path/to/whatever.bin')
+
+.. automodule:: shorttext.utils.dtm
+ :members: load_DocumentTermMatrix
+
+Reference
+---------
+
+Christopher Manning, Hinrich Schuetze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press
+`_]
+
+"Document-Term Matrix: Text Mining in R and Python," *Everything About Data Analytics*, WordPress (2018). [`WordPress
+`_]
+
+Home: :doc:`index`
\ No newline at end of file
diff --git a/docs/tutorial_maxent.rst b/docs/tutorial_maxent.rst
index 407a4234..2eec0911 100644
--- a/docs/tutorial_maxent.rst
+++ b/docs/tutorial_maxent.rst
@@ -1,6 +1,9 @@
Maximum Entropy (MaxEnt) Classifier
===================================
+Maxent
+------
+
Maximum entropy (maxent) classifier has been a popular text classifier, by parameterizing the model
to achieve maximum categorical entropy, with the constraint that the resulting probability
on the training data with the model being equal to the real distribution.
@@ -35,8 +38,27 @@ After training, it can be used for classification, such as
To save the model,
->>> classifier.score('/path/to/filename.bin')
+>>> classifier.save_compact_model('/path/to/filename.bin')
To load the model to be a classifier, enter:
>>> classifier2 = shorttext.classifiers.load_maxent_classifier('/path/to/filename.bin')
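+
+As with the other classifiers in `shorttext`, the loaded classifier can then score a piece of short text
+(the query string below is just an illustrative example):
+
+>>> classifier2.score('maximum entropy modeling')   # returns a dict mapping each class label to a score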
+
+
+.. automodule:: shorttext.classifiers.bow.maxent.MaxEntClassification
+ :members:
+
+
+Reference
+---------
+
+Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). [`ACM
+`_]
+
+Daniel E. Russ, Kwan-Yuet Ho, Joanne S. Colt, Karla R. Armenti, Dalsu Baris, Wong-Ho Chow, Faith Davis, Alison Johnson, Mark P. Purdue, Margaret R. Karagas, Kendra Schwartz, Molly Schwenn, Debra T. Silverman, Patricia A. Stewart, Calvin A. Johnson, Melissa C. Friesen, “Computer-based coding of free-text job descriptions to efficiently and reliably incorporate occupational risk factors into large-scale epidemiological studies”, *Occup. Environ. Med.* 73, 417-424 (2016). [`BMJ
+`_]
+
+Daniel Russ, Kwan-yuet Ho, Melissa Friesen, "It Takes a Village To Solve A Problem in Data Science," Data Science Maryland, presentation at Applied Physics Laboratory (APL), Johns Hopkins University, on June 19, 2017. (2017) [`Slideshare
+`_]
+
+Home: :doc:`index`
\ No newline at end of file
diff --git a/docs/tutorial_metrics.rst b/docs/tutorial_metrics.rst
new file mode 100644
index 00000000..920e3648
--- /dev/null
+++ b/docs/tutorial_metrics.rst
@@ -0,0 +1,418 @@
+Metrics
+=======
+
+The package `shorttext` provides a few metrics that measure distances of some kind. They are all
+under the module :mod:`shorttext.metrics`. The soft Jaccard score is based on spellings, and the Word Mover's
+distance (WMD) on embedded word vectors.
+
+Edit Distance and Soft Jaccard Score
+------------------------------------
+
+Edit distance, or the Damerau-Levenshtein distance, measures the difference
+between two words due to insertions, deletions, transpositions, and substitutions.
+Each of these changes contributes a distance of 1. The algorithm was written in C.
+
+First import the package:
+
+>>> from shorttext import damerau_levenshtein, longest_common_prefix, similarity, soft_jaccard_score
+>>> from shorttext.metrics.dynprog.lcp import damerau_levenshtein, longest_common_prefix, similarity, soft_jaccard_score
+>>> from shorttext.metrics.dynprog import similarity, soft_jaccard_score
+
+The distance can be calculated by:
+
+>>> damerau_levenshtein('diver', 'driver') # insertion, gives 1
+>>> damerau_levenshtein('driver', 'diver') # deletion, gives 1
+>>> damerau_levenshtein('topology', 'tooplogy') # transposition, gives 1
+>>> damerau_levenshtein('book', 'blok') # substitution, gives 1
+
+The longest common prefix finds the length of common prefix:
+
+>>> longest_common_prefix('topology', 'topological') # gives 7
+>>> longest_common_prefix('police', 'policewoman') # gives 6
+
+The similarity between words is defined as the larger of the following:
+
+:math:`s = 1 - \frac{\text{DL distance}}{\max(\text{len}(word1), \text{len}(word2))}`
+and
+:math:`s = \frac{\text{longest common prefix}}{\max(\text{len}(word1), \text{len}(word2))}`
+
+>>> similarity('topology', 'topological') # gives 0.6363636363636364
+>>> similarity('book', 'blok') # gives 0.75
+
+Given the similarity, we say that the intersection, for example, between 'book' and 'blok', has 0.75 elements, or the
+union has 1.25 elements. Then the similarity between two sets of tokens can be measured using Jaccard index, with this
+"soft" numbers of intersection. Therefore,
+
+>>> soft_jaccard_score(['book', 'seller'], ['blok', 'sellers']) # gives 0.6716417910447762
+>>> soft_jaccard_score(['police', 'station'], ['policeman']) # gives 0.2857142857142858
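+
+For the single pair 'book' and 'blok' above, for instance, the soft intersection has 0.75 elements and the
+soft union has :math:`1 + 1 - 0.75 = 1.25` elements, giving a soft Jaccard score of :math:`0.75 / 1.25 = 0.6`.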
+
+The functions `damerau_levenshtein` and `longest_common_prefix` are implemented using Cython_ .
+(Before release 0.7.2, they were interfaced to Python using SWIG_ (Simplified Wrapper and Interface Generator)).
+
+
+.. automodule:: shorttext.metrics.dynprog.jaccard
+ :members: similarity, soft_jaccard_score
+
+
+Word Mover's Distance
+---------------------
+
+Unlike the soft Jaccard score, which bases similarity on the words' spellings, Word Mover's distance (WMD)
+is based on the embedded word vectors. WMD is a special case of Earth Mover's distance (EMD), or Wasserstein
+distance. The calculation of WMD in this package is based on linear programming, and the distance between
+words is the Euclidean distance by default (not the cosine distance), but the user can set it accordingly.
+
+Import the modules, and load the word-embedding models:
+
+>>> from shorttext import word_mover_distance
+>>> from shorttext.utils import load_word2vec_model
+>>> wvmodel = load_word2vec_model('/path/to/model_file.bin')
+
+Examples:
+
+>>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789
+>>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033
+
+More examples can be found in this `IPython Notebook
+`_ .
+
+In `gensim`, the Word2Vec model allows the calculation of WMD if the user has installed the package PyEMD_. It is based on the
+scale-invariant feature transform (SIFT), an algorithm for EMD based on the L1-distance (Manhattan distance).
+For more details,
+please refer to their `tutorial
+`_ , and cite the two papers by Ofir Pele and Michael Werman
+if it is used.
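+
+As a rough sketch (reusing the word-embedding model `wvmodel` loaded above, and assuming the optional EMD
+backend is installed), the corresponding `gensim` call works directly on lists of tokens:
+
+>>> wvmodel.wmdistance(['police', 'station'], ['policeman'])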
+
+.. automodule:: shorttext.metrics.wasserstein.wordmoverdist
+ :members: word_mover_distance
+
+Jaccard Index Due to Cosine Distances
+-------------------------------------
+
+In the above section of edit distance, the Jaccard score was calculated by considering soft membership
+using spelling. However, we can also compute the soft membership by cosine similarity with
+
+>>> from shorttext.utils import load_word2vec_model
+>>> wvmodel = load_word2vec_model('/path/to/model_file.bin')
+>>> from shorttext.metrics.embedfuzzy import jaccardscore_sents
+
+For example, the soft number of words shared between the set containing 'doctor' and that containing 'physician'
+is 0.78060223420956831 (according to the Google model), and therefore the Jaccard score is
+
+:math:`0.78060223420956831 / (2-0.78060223420956831) = 0.6401538990056869`
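+
+(Here the soft size of the intersection is the cosine similarity :math:`s` itself, while the soft size of the
+union of the two one-word sets is :math:`1 + 1 - s = 2 - s`.)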
+
+And it can be seen by running it:
+
+>>> jaccardscore_sents('doctor', 'physician', wvmodel) # gives 0.6401538990056869
+>>> jaccardscore_sents('chief executive', 'computer cluster', wvmodel) # gives 0.0022515450768836143
+>>> jaccardscore_sents('topological data', 'data of topology', wvmodel) # gives 0.67588977344632573
+
+.. automodule:: shorttext.metrics.embedfuzzy.jaccard
+ :members:
+
+
+Reference
+---------
+
+"Damerau-Levenshtein Distance." [`Wikipedia
+`_]
+
+"Jaccard index." [`Wikipedia
+`_]
+
+Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE
+`_]
+
+Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).
+
+Ofir Pele, Michael Werman, "A linear time histogram metric for improved SIFT matching," *Computer Vision - ECCV 2008*, 495-508 (2008). [`ACM
+`_]
+
+Ofir Pele, Michael Werman, "Fast and robust earth mover's distances," *Proc. 2009 IEEE 12th Int. Conf. on Computer Vision*, 460-467 (2009). [`IEEE
+`_]
+
+"Word Mover’s Distance as a Linear Programming Problem," *Everything About Data Analytics*, WordPress (2017). [`WordPress
+`_]
+
+
+Home: :doc:`index`
+
+.. _SWIG: http://www.swig.org/
+.. _PyEMD: https://github.com/wmayner/pyemd
+.. _Cython: http://cython.org/
\ No newline at end of file
diff --git a/docs/tutorial_nnlib.rst b/docs/tutorial_nnlib.rst
index 0e2269b1..989857b3 100644
--- a/docs/tutorial_nnlib.rst
+++ b/docs/tutorial_nnlib.rst
@@ -21,7 +21,10 @@ and they are good for short text or document classification. Of course, users ca
own neural networks, written in `keras`.
A pre-trained Google Word2Vec model can be downloaded `here
-`_.
+`_,
+and a pre-trained Facebook FastText model can be downloaded `here
+`_.
+
See: :doc:`tutorial_wordembed` .
@@ -39,12 +42,16 @@ Then load the training data
Then we choose a neural network. We choose ConvNet:
->>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys()))
+>>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys()), vecsize=300)
Initialize the classifier:
>>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)
+.. autoclass:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification.VarNNEmbeddedVecClassifier
+ :members:
+
+
Then train the classifier:
>>> classifier.train(trainclassdict, kmodel)
@@ -82,12 +89,20 @@ To load it, enter:
>>> classifier2 = shorttext.classifiers.load_varnnlibvec_classifier(wvmodel, '/path/to/nnlibvec_convnet_subdata.bin')
+.. automodule:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification
+ :members: load_varnnlibvec_classifier
+
+
Provided Neural Networks
------------------------
There are three neural networks available in this package for the use in
:class:`shorttext.classifiers.VarNNEmbeddedVecClassifier`,
-and they are available in the module :module:`shorttext.classifiers.frameworks`.
+and they are available in the module `shorttext.classifiers.frameworks`.
+
+.. automodule:: shorttext.classifiers.embed.nnlib.frameworks
+ :members:
+
ConvNet (Convolutional Neural Network)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -97,26 +112,22 @@ as demonstrated in Kim's paper.
.. image:: images/nnlib_cnn.png
-The function in the frameworks returns a :class:`keras.models.Sequential`.
-
-.. autofunction:: shorttext.classifiers.embed.nnlib.frameworks.CNNWordEmbed
+The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. Its input parameters are:
The parameter `maxlen` defines the maximum length of the sentences. If the sentence has less than `maxlen`
words, then the empty words will be filled with zero vectors.
->>> kmodel = fr.CNNWordEmbed(len(trainclassdict.keys()))
+>>> kmodel = fr.CNNWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size)
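+
+If the default sentence length is not suitable, `maxlen` can also be set explicitly when building the network (the
+value below is only illustrative):
+
+>>> kmodel = fr.CNNWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size, maxlen=15)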
Double ConvNet
^^^^^^^^^^^^^^
-This neural network is nothing more than two ConvNet layers.
-
-.. autofunction:: shorttext.classifiers.embed.nnlib.frameworks.DoubleCNNWordEmbed
+This neural network is nothing more than two ConvNet layers. The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. Its input parameters are:
The parameter `maxlen` defines the maximum length of the sentences. If the sentence has less than `maxlen`
words, then the empty words will be filled with zero vectors.
->>> kmodel = fr.DoubleCNNWordEmbed(len(trainclassdict.keys()))
+>>> kmodel = fr.DoubleCNNWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size)
C-LSTM (Convolutional Long Short-Term Memory)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -127,14 +138,12 @@ and then followed by LSTM (long short-term memory), a type of recurrent neural n
.. image:: images/nnlib_clstm.png
-The function in the frameworks returns a :class:`keras.models.Sequential`.
-
-.. autofunction:: shorttext.classifiers.embed.nnlib.frameworks.CLSTMWordEmbed
+The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`.
The parameter `maxlen` defines the maximum length of the sentences. If the sentence has less than `maxlen`
words, then the empty words will be filled with zero vectors.
->>> kmodel = fr.CLSTMWordEmbed(len(trainclassdict.keys()))
+>>> kmodel = fr.CLSTMWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size)
User-Defined Neural Network
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -149,6 +158,16 @@ of the embedded vectors. The output is a one-dimensional array, of size equal to
the number of classes provided by the training data. The order of the class labels is assumed
to be the same as the order of the given training data (stored as a Python dictionary).
+Putting Word2Vec Model As an Input Keras Layer (Deprecated)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This functionality has been removed since release 0.5.11, for the following reasons:
+
+* changes in `keras` broke this feature;
+* the layer consumed a large amount of memory;
+* only Word2Vec was supported; and
+* the results were incorrect.
+
Reference
---------
@@ -158,6 +177,9 @@ Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, "A C-LSTM Neural Network
"CS231n Convolutional Neural Networks for Visual Recognition," Stanford Online Course. [`link
`_]
+Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom, "A Convolutional Neural Network for Modelling Sentences," *Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics*, pp. 655-665 (2014). [`arXiv
+`_]
+
Tal Perry, "Convolutional Methods for Text," *Medium* (2017). [`Medium
`_]
diff --git a/docs/tutorial_spell.rst b/docs/tutorial_spell.rst
new file mode 100644
index 00000000..e6baefd7
--- /dev/null
+++ b/docs/tutorial_spell.rst
@@ -0,0 +1,38 @@
+Spell Correctors
+================
+
+This package supports the use of spell correctors, because typos are very common in relatively short text data.
+
+There are two types of spell correctors provided: one described by Peter Norvig (using an n-gram Bayesian method),
+and another by Keisuke Sakaguchi and his colleagues (using a semi-character-level recurrent neural network).
+
+>>> import shorttext
+
+We use Norvig's training corpus as an example. To load it,
+
+>>> from urllib.request import urlopen
+>>> text = urlopen('https://norvig.com/big.txt').read()
+
+The developer just has to instantiate the spell corrector, and then train it with a corpus to get a correction model.
+Then one can use it for correction.
+
+Norvig
+------
+
+Peter Norvig described a spell corrector based on a Bayesian approach and edit distance. You can refer to his blog for
+more information.
+
+>>> norvig_corrector = shorttext.spell.NorvigSpellCorrector()
+>>> norvig_corrector.train(text)
+>>> norvig_corrector.correct('oranhe') # gives "orange"
+
+.. automodule:: shorttext.spell.norvig
+ :members:
+
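+Sakaguchi (scRNN)
+-----------------
+
+The semi-character-level recurrent neural network (scRNN) corrector by Sakaguchi and his colleagues is provided as
+an example script under `examples/sakaguchi_spell` in this repository. A minimal usage sketch, assuming the
+`SCRNNSpellCorrector` class has been imported from that example module and `text` is the training corpus loaded above:
+
+>>> scrnn_corrector = SCRNNSpellCorrector('JUMBLE-WHOLE')
+>>> scrnn_corrector.train(text)
+>>> scrnn_corrector.correct('oranhe')
+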
+
+
+Reference
+---------
+
+Peter Norvig, "How to write a spell corrector." (2016) [`Norvig
+`_]
diff --git a/docs/tutorial_stacking.rst b/docs/tutorial_stacking.rst
index 7fd67185..037ccd0c 100644
--- a/docs/tutorial_stacking.rst
+++ b/docs/tutorial_stacking.rst
@@ -77,6 +77,11 @@ offered in this package. To load them, initialize it in the same way:
>>> stacker2 = shorttext.stack.LogisticStackedGeneralization(intermediate_classifiers={'clstm': clstm_classifier, 'lda128': lda128_svm_classifier})
>>> stacker2.load_compact_model('/path/to/logitmodel.bin')
+
+.. automodule:: shorttext.stack.stacking
+ :members:
+
+
Reference
---------
diff --git a/docs/tutorial_sumvec.rst b/docs/tutorial_sumvec.rst
index de62a725..c5ffaf70 100644
--- a/docs/tutorial_sumvec.rst
+++ b/docs/tutorial_sumvec.rst
@@ -23,6 +23,318 @@ Import the package:
To load the Word2Vec model,
+
>>> from shorttext.utils import load_word2vec_model
>>> wvmodel = load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz')
@@ -32,7 +344,7 @@ Then we load a set of data:
Then initialize the classifier:
->>> classifier = shorttext.classifiers.SumEmbeddedVecClassifier(wvmodel)
+>>> classifier = shorttext.classifiers.SumEmbeddedVecClassifier(wvmodel) # for Google model, the vector size is 300 (default: 100)
>>> classifier.train(nihtraindata)
This classifier takes relatively little time to train compared with others
@@ -69,6 +381,10 @@ And with the same pre-trained Word2Vec model, this classifier can be loaded:
>>> classifier2 = shorttext.classifiers.load_sumword2vec_classifier(wvmodel, '/path/to/sumvec_nihdata_model.bin')
+.. autoclass:: shorttext.classifiers.embed.sumvec.SumEmbedVecClassification.SumEmbeddedVecClassifier
+ :members:
+
+
Appendix: Model I/O in Previous Versions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/tutorial_textpreprocessing.rst b/docs/tutorial_textpreprocessing.rst
index df294fa4..e414c56b 100644
--- a/docs/tutorial_textpreprocessing.rst
+++ b/docs/tutorial_textpreprocessing.rst
@@ -15,7 +15,121 @@ following steps:
- removing numerals,
- converting all alphabets to lower cases,
- removing stop words, and
-- stemming the words (using Porter stemmer).
+- stemming the words (using Snowball Porter stemmer).
+
To do this, load the preprocesser generator:
@@ -25,6 +139,9 @@ Then define the preprocessor, a function, by just calling:
>>> preprocessor1 = standard_text_preprocessor_1()
+.. automodule:: shorttext.utils.textpreprocessing
+ :members: standard_text_preprocessor_1
+
It is a function that perform the preprocessing in the steps above:
>>> preprocessor1('Maryland Blue Crab') # output: 'maryland blue crab'
@@ -45,6 +162,90 @@ let's develop a preprocessor that 1) convert it to base form if it is a verb, or
Load the function that generates the preprocessor function:
+
>>> from shorttext.utils import text_preprocessor
Initialize a WordNet lemmatizer using `nltk`:
@@ -54,9 +255,9 @@ Initialize a WordNet lemmatizer using `nltk`:
Define the pipeline. Functions for each of the steps are:
->>> step1fcn = lambda s: ' '.join(map(lambda s1: lemmatizer.lemmatize(s1), s.split(' ')))
+>>> step1fcn = lambda s: ' '.join([lemmatizer.lemmatize(s1) for s1 in s.split(' ')])
>>> step2fcn = lambda s: s.upper()
->>> step3fcn = lambda s: ' '.join(map(lambda s1: s1+'-'+str(len(s1)), s.split(' ')))
+>>> step3fcn = lambda s: ' '.join([s1+'-'+str(len(s1)) for s1 in s.split(' ')])
Then the pipeline is:
@@ -72,11 +273,14 @@ Some examples are:
>>> preprocessor2('Maryland blue crab in Annapolis') # output: 'MARYLAND-8 BLUE-4 CRAB-4 IN-2 ANNAPOLIS-9'
>>> preprocessor2('generative adversarial networks') # output: 'GENERATIVE-10 ADVERSARIAL-11 NETWORK-7'
+.. automodule:: shorttext.utils.textpreprocessing
+ :members: text_preprocessor
+
Tokenization
------------
Users are free to choose any tokenizer they wish. In `shorttext`, the tokenizer is
-implemented with `spaCy`, and can be called:
+simply the space delimiter, and can be called:
>>> shorttext.utils.tokenize('Maryland blue crab') # output: ['Maryland', 'blue', 'crab']
diff --git a/docs/tutorial_topic.rst b/docs/tutorial_topic.rst
index 97c21def..eab97915 100644
--- a/docs/tutorial_topic.rst
+++ b/docs/tutorial_topic.rst
@@ -71,7 +71,7 @@ While initialize the instance of the topic modeler, the user can also specify
whether to weigh the terms using tf-idf (term frequency - inverse document frequency).
The default is to weigh. To not weigh, initialize it as
->>> topicmodeler3 = shorttext.generators.GensimTopicModeler(toweight=False)
+>>> topicmodeler3 = shorttext.generators.GensimTopicModeler(toweigh=False)
Appendix: Model I/O in Previous Versions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -88,6 +88,11 @@ not saved. To load the model, enter:
>>> topicmodeler2 = shorttext.classifiers.load_gensimtopicmodel('/path/to/nihlda128', compact=False)
+
+.. automodule:: shorttext.generators.bow.GensimTopicModeling
+ :members:
+
+
AutoEncoder
-----------
@@ -135,7 +140,12 @@ Like other topic models, while initialize the instance of the topic modeler, the
whether to weigh the terms using tf-idf (term frequency - inverse document frequency).
The default is to weigh. To not weigh, initialize it as:
->>> autoencoder3 = shorttext.generators.AutoencodingTopicModeler(toweight=False)
+>>> autoencoder3 = shorttext.generators.AutoencodingTopicModeler(toweigh=False)
+
+
+.. automodule:: shorttext.generators.bow.AutoEncodingTopicModeling
+ :members:
+
Appendix: Unzipping Model I/O
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -178,6 +188,12 @@ an abstract class virtually. If user wants to develop its own topic model that e
this, he has to define the methods `train`, `retrieve_topic_vec`, `loadmodel`, and
`savemodel`.
+.. automodule:: shorttext.generators.bow.LatentTopicModeling
+ :members:
+
+.. automodule:: shorttext.generators.bow.GensimTopicModeling
+ :members:
+
Appendix: Namespaces for Topic Modeler in Previous Versions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -197,8 +213,9 @@ A list include:
shorttext.classifiers.load_autoencoder_topic -> shorttext.generators.load_autoencoder_topicmodel
-For backward compatibility, developers can still call the topic models as if there were no such changes,
-although they are advised to make this change.
+Before release 0.5.6, for backward compatibility, developers could still call the topic models as if there were no such changes,
+although they were advised to update their code. However, *effective release 0.5.7, this backward compatibility is no longer
+available.*
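+
+For example, code written against the old namespace would now read (a sketch with a hypothetical model path):
+
+>>> autoencoder2 = shorttext.generators.load_autoencoder_topicmodel('/path/to/sub_autoencoder8.bin')
+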
Classification Using Cosine Similarity
--------------------------------------
@@ -231,6 +248,10 @@ The same thing for autoencoder, but the classifier based on autoencoder can be l
>>> cos_classifier = shorttext.classifiers.load_autoencoder_cosineClassifier('/path/to/sub_autoencoder8.bin')
+.. automodule:: shorttext.classifiers.bow.topic.TopicVectorDistanceClassification
+ :members:
+
+
Classification Using Scikit-Learn Classifiers
---------------------------------------------
@@ -239,7 +260,7 @@ algorithms. We can take any supervised learning algorithms in `scikit-learn` her
We use Gaussian naive Bayes as an example. For faster demonstration, use the subject
keywords as the example dataset.
->>> subtopicmodeler = shorttext.generators.GensimTopicModeler()
+>>> subtopicmodeler = shorttext.generators.LDAModeler()
>>> subtopicmodeler.train(subdict, 8)
We first import the class:
@@ -271,6 +292,13 @@ will still do the work. However, to load the saved classifier with an autoencode
>>> classifier2 = shorttext.classifiers.load_autoencoder_topic_sklearnclassifier('/path/to/filename.bin')
+Compact model files saved by `TopicVectorSkLearnClassifier` in `shorttext` >= 1.0.0 cannot be read by earlier
+versions of `shorttext`. The converse is not true, though: old compact model files can still be read in.
+
+.. automodule:: shorttext.classifiers.bow.topic.SkLearnClassification
+ :members:
+
+
Notes about Text Preprocessing
------------------------------
diff --git a/docs/tutorial_wordembed.rst b/docs/tutorial_wordembed.rst
index 1b719c74..692b3d74 100644
--- a/docs/tutorial_wordembed.rst
+++ b/docs/tutorial_wordembed.rst
@@ -10,13 +10,84 @@ their page. To load the model, call:
>>> import shorttext
>>> wvmodel = shorttext.utils.load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz')
-It is a binary file, and the default is set to be `binary=True`. In fact, it is equivalent to calling,
-if you have `gensim` version before 1.0.0:
->>> import gensim
->>> wvmodel = gensim.models.Word2Vec.load_word2vec_format('/path/to/GoogleNews-vectors-negative300.bin.gz', binary=True)
-Or beyond version 1.0.0,
+It is a binary file, and the default is set to be `binary=True`.
+
+.. automodule:: shorttext.utils.wordembed
+ :members: load_word2vec_model
+
+It is equivalent to calling,
>>> import gensim
>>> wvmodel = gensim.models.KeyedVectors.load_word2vec_format('/path/to/GoogleNews-vectors-negative300.bin.gz', binary=True)
@@ -72,18 +143,69 @@ One can convert a text-format GloVe model into a text-format Word2Vec model. Mor
in the documentation of `gensim`: `Converting GloVe to Word2Vec
`_
+FastText
+--------
+
+FastText is a similar word-embedding model from Facebook. You can download pre-trained models here:
+
+`Pre-trained word vectors
+`_
+
+To load a pre-trained FastText model, run:
+
+>>> import shorttext
+>>> ftmodel = shorttext.utils.load_fasttext_model('/path/to/model.bin')
+
+And it is used exactly the same way as Word2Vec.
+
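+For instance, it can be passed to the classifiers in this package wherever a Word2Vec model is expected (a sketch,
+using the subject-keyword sample data):
+
+>>> trainclassdict = shorttext.data.subjectkeywords()
+>>> classifier = shorttext.classifiers.SumEmbeddedVecClassifier(ftmodel)
+>>> classifier.train(trainclassdict)
+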
+.. automodule:: shorttext.utils.wordembed
+ :members: load_fasttext_model
+
+Poincaré Embeddings
+-------------------
+
+Poincaré embeddings are a new type of embedding that learns both semantic similarity and hierarchical structure. To load a
+pre-trained model, run:
+
+>>> import shorttext
+>>> pemodel = shorttext.utils.load_poincare_model('/path/to/model.txt')
+
+For preloaded word-embedding models, please refer to :doc:`tutorial_wordembed`.
+
+.. automodule:: shorttext.utils.wordembed
+ :members: load_poincare_model
+
+
+
+Other Functions
+---------------
+
+.. automodule:: shorttext.utils.wordembed
+ :members: shorttext_to_avgvec
+
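+A usage sketch of `shorttext_to_avgvec`, which condenses a short text into a single embedded vector under the loaded
+model:
+
+>>> shorttext.utils.shorttext_to_avgvec('maryland blue crab', wvmodel)   # a 300-dimensional vector for the Google News model
+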
+
Links
-----
- Word2Vec_
- GloVe_
+- FastText_
Reference
---------
+Jayant Jain, "Implementing Poincaré Embeddings," RaRe Technologies (2017). [`RaRe
+`_]
+
Jeffrey Pennington, Richard Socher, Christopher D. Manning, “GloVe: Global Vectors for Word Representation,” *Empirical Methods in Natural Language Processing (EMNLP)*, pp. 1532-1543 (2014). [`PDF
`_]
+Maximilian Nickel, Douwe Kiela, "Poincaré Embeddings for Learning Hierarchical Representations," arXiv:1705.08039 (2017). [`arXiv
+`_]
+
+Piotr Bojanowski, Edouard Grave, Armand Joulin, Tomas Mikolov, "Enriching Word Vectors with Subword Information," arXiv:1607.04606 (2016). [`arXiv
+`_]
+
Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean, “Efficient Estimation of Word Representations in Vector Space,” *ICLR* 2013 (2013). [`arXiv
`_]
@@ -102,4 +224,5 @@ Radim Řehůřek, "Making sense of word2vec," RaRe Technologies (2014). [`RaRe
Home: :doc:`index`
.. _Word2Vec: https://code.google.com/archive/p/word2vec/
-.. _GloVe: http://nlp.stanford.edu/projects/glove/
\ No newline at end of file
+.. _GloVe: http://nlp.stanford.edu/projects/glove/
+.. _FastText: https://github.com/facebookresearch/fastText
diff --git a/examples/sakaguchi_spell/binarize.py b/examples/sakaguchi_spell/binarize.py
new file mode 100644
index 00000000..49b1fc02
--- /dev/null
+++ b/examples/sakaguchi_spell/binarize.py
@@ -0,0 +1,164 @@
+
+import re
+import string
+from functools import reduce
+
+import numpy as np
+from shorttext.generators.charbase.char2vec import initSentenceToCharVecEncoder
+from shorttext.utils import OperationNotDefinedException
+
+
+default_alph = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,:;'*!?`$%&(){}[]-/\@_#"
+# NB. # is <eos>, _ is <unk>, @ is number
+default_specialsignals = {'eos': '#', 'unk': '_', 'number': '@'}
+default_signaldenotions = {'<eos>': 'eos', '<unk>': 'unk'}
+
+
+class SpellingToConcatCharVecEncoder:
+ def __init__(self, alph):
+ self.charevec_encoder = initSentenceToCharVecEncoder(alph)
+
+ def encode_spelling(self, spelling):
+ spmat = self.charevec_encoder.encode_sentence(spelling, len(spelling))
+ return spmat.sum(axis=0)
+
+ def __len__(self):
+ return len(self.charevec_encoder)
+
+
+def hasnum(word):
+ return len(re.findall('\\d', word)) > 0
+
+
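+# Each SCRNNBinarizer method below returns a tuple (vec, word): a character-count vector of shape
+# (3*len(alphabet), 1), arranged in three positional blocks whose content depends on the operation,
+# and the possibly distorted word itself.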
+class SCRNNBinarizer:
+ """ A class used by Sakaguchi's spell corrector to convert text into numerical vectors.
+
+ No documentation for this class.
+
+ """
+ def __init__(self, alpha, signalchar_dict):
+ self.signalchar_dict = signalchar_dict
+ self.concatchar_encoder = SpellingToConcatCharVecEncoder(alpha)
+ self.char_dict = self.concatchar_encoder.charevec_encoder.dictionary
+
+ def noise_char(self, word, opt, unchanged=False):
+ bin_all = np.zeros((len(self.concatchar_encoder), 1))
+ w = word
+ if word in default_signaldenotions.keys():
+ bin_all[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+ elif hasnum(word):
+ bin_all[self.char_dict.token2id[default_specialsignals['number']]] += 1
+ elif unchanged:
+ bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+ elif opt=='DELETE':
+ if len(word) > 1:
+ idx = np.random.randint(0, len(word))
+ w = word[:idx] + word[(idx+1):]
+ else:
+ w = word
+ bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+ elif opt=='INSERT':
+ ins_idx = np.random.randint(0, len(word)+1)
+ ins_char = np.random.choice([c for c in string.ascii_lowercase])
+ w = word[:ins_idx] + ins_char + word[ins_idx:]
+ bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+ elif opt=='REPLACE':
+ rep_idx = np.random.randint(0, len(word))
+ rep_char = np.random.choice([c for c in string.ascii_lowercase])
+ w = word[:rep_idx] + rep_char + w[(rep_idx+1):]
+ bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+ else:
+ raise OperationNotDefinedException('NOISE-'+opt)
+ return np.array([ np.repeat(np.array([bin_all]), 3, axis=0).reshape((1, len(self.concatchar_encoder)*3))[0] ]).transpose(), w
+
+ def jumble_char(self, word, opt, unchanged=False):
+ if opt=='WHOLE':
+ return self.jumble_char_whole(word, unchanged=unchanged)
+ elif opt=='BEG':
+ return self.jumble_char_beg(word, unchanged=unchanged)
+ elif opt=='END':
+ return self.jumble_char_end(word, unchanged=unchanged)
+ elif opt=='INT':
+ return self.jumble_char_int(word, unchanged=unchanged)
+ else:
+ raise OperationNotDefinedException('JUMBLE-'+opt)
+
+ def jumble_char_whole(self, word, unchanged=False):
+ bin_all = np.zeros((len(self.concatchar_encoder), 1))
+ w = word
+ if word in default_signaldenotions.keys():
+ bin_all[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+ elif hasnum(word):
+ bin_all[self.char_dict.token2id[default_specialsignals['number']]] += 1
+ else:
+ w = ''.join(np.random.choice([c for c in word], len(word), replace=False)) if not unchanged else word
+ bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+ bin_filler = np.zeros((len(self.concatchar_encoder)*2, 1))
+ return np.concatenate((bin_all, bin_filler), axis=0), w
+
+ def jumble_char_beg(self, word, unchanged=False):
+ bin_initial = np.zeros((len(self.concatchar_encoder), 1))
+ bin_end = np.zeros((len(self.concatchar_encoder), 1))
+ bin_filler = np.zeros((len(self.concatchar_encoder), 1))
+ w = word
+ if word in default_signaldenotions.keys():
+ bin_initial[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+ bin_end[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+ elif hasnum(word):
+ bin_initial[self.char_dict.token2id[default_specialsignals['number']]] += 1
+ bin_end[self.char_dict.token2id[default_specialsignals['number']]] += 1
+ else:
+ w_init = ''.join(np.random.choice([c for c in word[:-1]], len(word)-1)) if not unchanged and len(w)>3 else word[:-1]
+ w = w_init + word[-1]
+ if len(w_init) > 0:
+ bin_initial = self.concatchar_encoder.encode_spelling(w_init).transpose()
+ bin_end = self.concatchar_encoder.encode_spelling(word[-1]).transpose()
+ return reduce(lambda a, b: np.concatenate((a, b), axis=0), [bin_initial, bin_end, bin_filler]), w
+
+ def jumble_char_end(self, word, unchanged=False):
+ bin_initial = np.zeros((len(self.concatchar_encoder), 1))
+ bin_end = np.zeros((len(self.concatchar_encoder), 1))
+ bin_filler = np.zeros((len(self.concatchar_encoder), 1))
+ w = word
+ if word in default_signaldenotions.keys():
+ bin_initial[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+ bin_end[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+ elif hasnum(word):
+ bin_initial[self.char_dict.token2id[default_specialsignals['number']]] += 1
+ bin_end[self.char_dict.token2id[default_specialsignals['number']]] += 1
+ else:
+ w_end = ''.join(np.random.choice([c for c in word[1:]], len(word)-1)) if not unchanged and len(w)>3 else word[1:]
+ w = word[0] + w_end
+ bin_initial = self.concatchar_encoder.encode_spelling(word[0]).transpose()
+ if len(w_end) > 0:
+ bin_end = self.concatchar_encoder.encode_spelling(w_end).transpose()
+ return reduce(lambda a, b: np.concatenate((a, b), axis=0), [bin_initial, bin_end, bin_filler]), w
+
+ def jumble_char_int(self, word, unchanged=False):
+ bin_initial = np.zeros((len(self.concatchar_encoder), 1))
+ bin_middle = np.zeros((len(self.concatchar_encoder), 1))
+ bin_end = np.zeros((len(self.concatchar_encoder), 1))
+ w = word
+ if word in default_signaldenotions.keys():
+ bin_initial[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+ bin_middle[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+ bin_end[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+ elif hasnum(word):
+ bin_initial[self.char_dict.token2id[default_specialsignals['number']]] += 1
+ bin_middle[self.char_dict.token2id[default_specialsignals['number']]] += 1
+ bin_end[self.char_dict.token2id[default_specialsignals['number']]] += 1
+ else:
+ w_mid = ''.join(np.random.choice([c for c in word[1:-1]], len(word)-2)) if not unchanged and len(w)>3 else w[1:-1]
+ w = word[0] + w_mid + word[-1]
+ bin_initial = self.concatchar_encoder.encode_spelling(word[0]).transpose()
+ if len(w_mid)>0:
+ bin_middle = self.concatchar_encoder.encode_spelling(w_mid).transpose()
+ bin_end = self.concatchar_encoder.encode_spelling(word[-1]).transpose()
+ return reduce(lambda a, b: np.append(a, b, axis=0), [bin_initial, bin_middle, bin_end]), w
+
+ def change_nothing(self, word, operation):
+ if operation.upper().startswith('NOISE'):
+ return self.noise_char(word, operation[6:], unchanged=True)
+ else:
+ return self.jumble_char(word, operation[7:], unchanged=True)
+
diff --git a/examples/sakaguchi_spell/sakaguchi.py b/examples/sakaguchi_spell/sakaguchi.py
new file mode 100644
index 00000000..3d29e6c9
--- /dev/null
+++ b/examples/sakaguchi_spell/sakaguchi.py
@@ -0,0 +1,202 @@
+
+# Reference: https://github.com/keisks/robsut-wrod-reocginiton
+# Article: http://cs.jhu.edu/~kevinduh/papers/sakaguchi17robsut.pdf
+
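+# Example usage (a sketch; assumes `text` holds a training corpus string):
+#   corrector = SCRNNSpellCorrector('JUMBLE-WHOLE')
+#   corrector.train(text)
+#   corrector.correct('oranhe')
+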
+import json
+
+import numpy as np
+from gensim.corpora import Dictionary
+from sklearn.preprocessing import OneHotEncoder
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, TimeDistributed
+
+import shorttext.utils.kerasmodel_io as kerasio
+from shorttext.spell import SpellCorrector
+from .binarize import default_alph, default_specialsignals
+from shorttext.utils import classification_exceptions as ce
+from .binarize import SpellingToConcatCharVecEncoder, SCRNNBinarizer
+from shorttext.utils import CompactIOMachine
+
+
+nospace_tokenize = lambda sentence: [t.strip() for t in sentence.split() if len(t.strip())>0]
+
+
+class SCRNNSpellCorrector(SpellCorrector, CompactIOMachine):
+ """ scRNN (semi-character-level recurrent neural network) Spell Corrector.
+
+ Reference:
+ Keisuke Sakaguchi, Kevin Duh, Matt Post, Benjamin Van Durme, "Robsut Wrod Reocginiton via semi-Character Recurrent Neural Networ," arXiv:1608.02214 (2016). [`arXiv
+ `_]
+
+ """
+ def __init__(self, operation,
+ alph=default_alph,
+ specialsignals=default_specialsignals,
+ concatcharvec_encoder=None,
+ batchsize=1,
+ nb_hiddenunits=650):
+ """ Instantiate the scRNN spell corrector.
+
+ :param operation: types of distortion of words in training (options: "NOISE-INSERT", "NOISE-DELETE", "NOISE-REPLACE", "JUMBLE-WHOLE", "JUMBLE-BEG", "JUMBLE-END", and "JUMBLE-INT")
+ :param alph: default string of characters (Default: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,:;'*!?`$%&(){}[]-/\@_#")
+ :param specialsignals: dictionary of special signals (Default built-in)
+ :param concatcharvec_encoder: one-hot encoder for characters, initialize if None. (Default: None)
+ :param batchsize: batch size. (Default: 1)
+ :param nb_hiddenunits: number of hidden units. (Default: 650)
+ :type operation: str
+ :type alph: str
+ :type specialsignals: dict
+ :type concatcharvec_encoder: shorttext.spell.binarize.SpellingToConcatCharVecEncoder
+ :type batchsize: int
+ :type nb_hiddenunits: int
+ """
+ CompactIOMachine.__init__(self, {'classifier': 'scrnn_spell'}, 'scrnn_spell', ['_config.json', '_vocabs.gensimdict', '.weights.h5', '.json'])
+ self.operation = operation
+ self.alph = alph
+ self.specialsignals = specialsignals
+ self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
+ self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(self.alph) if concatcharvec_encoder==None else concatcharvec_encoder
+ self.onehotencoder = OneHotEncoder()
+ self.trained = False
+ self.batchsize = batchsize
+ self.nb_hiddenunits = nb_hiddenunits
+
+ def preprocess_text_train(self, text):
+ """ A generator that output numpy vectors for the text for training.
+
+ :param text: text
+ :return: generator that outputs the numpy vectors for training
+ :type text: str
+ :rtype: generator
+ """
+ for token in nospace_tokenize(text):
+ if self.operation.upper().startswith('NOISE'):
+ xvec, _ = self.binarizer.noise_char(token, self.operation.upper()[6:])
+ elif self.operation.upper().startswith('JUMBLE'):
+ xvec, _ = self.binarizer.jumble_char(token, self.operation.upper()[7:])
+ normtoken = token if token in self.dictionary.token2id else ''
+ yvec = self.onehotencoder.transform([[self.dictionary.token2id[normtoken]]]).toarray().reshape((len(self.dictionary), 1))
+ yield xvec, yvec
+
+ def preprocess_text_correct(self, text):
+ """ A generator that output numpy vectors for the text for correction.
+
+ ModelNotTrainedException is raised if the model has not been trained.
+
+ :param text: text
+ :return: generator that outputs the numpy vectors for correction
+ :type text: str
+ :rtype: generator
+ :raise: ModelNotTrainedException
+ """
+ if not self.trained:
+ raise ce.ModelNotTrainedException()
+ for token in nospace_tokenize(text):
+ xvec, _ = self.binarizer.change_nothing(token, self.operation)
+ yield xvec
+
+ def train(self, text, nb_epoch=100, dropout_rate=0.01, optimizer='rmsprop'):
+ """ Train the scRNN model.
+
+ :param text: training corpus
+ :param nb_epoch: number of epochs (Default: 100)
+ :param dropout_rate: dropout rate (Default: 0.01)
+ :param optimizer: optimizer (Default: "rmsprop")
+ :type text: str
+ :type nb_epoch: int
+ :type dropout_rate: float
+ :type optimizer: str
+ """
+ self.dictionary = Dictionary([nospace_tokenize(text), default_specialsignals.values()])
+ self.onehotencoder.fit(np.arange(len(self.dictionary)).reshape((len(self.dictionary), 1)))
+ xylist = [(xvec.transpose(), yvec.transpose()) for xvec, yvec in self.preprocess_text_train(text)]
+ xtrain = np.array([item[0] for item in xylist])
+ ytrain = np.array([item[1] for item in xylist])
+
+ # neural network here
+ model = Sequential()
+ model.add(LSTM(self.nb_hiddenunits, return_sequences=True))
+ model.add(Dropout(dropout_rate))
+ model.add(TimeDistributed(Dense(len(self.dictionary))))
+ model.add(Activation('softmax'))
+
+ # compile... more arguments
+ model.compile(loss='categorical_crossentropy', optimizer=optimizer)
+
+ # training
+ model.fit(xtrain, ytrain, epochs=nb_epoch)
+
+ self.model = model
+ self.trained = True
+
+ def correct(self, word):
+ """ Recommend a spell correction to given the word.
+
+ :param word: a given word
+ :return: recommended correction
+ :type word: str
+ :rtype: str
+ :raise: ModelNotTrainedException
+ """
+ if not self.trained:
+ raise ce.ModelNotTrainedException()
+
+ xmat = np.array([xvec.transpose() for xvec in self.preprocess_text_correct(word)])
+ yvec = self.model.predict(xmat)
+
+ maxy = yvec.argmax(axis=-1)
+ return ' '.join([self.dictionary[y] for y in maxy[0]])
+
+ def loadmodel(self, prefix):
+ """ Load the model.
+
+ :param prefix: prefix of the model path
+ :return: None
+ :type prefix: str
+ """
+ self.dictionary = Dictionary.load(prefix+'_vocabs.gensimdict')
+ parameters = json.load(open(prefix+'_config.json', 'r'))
+ self.operation = parameters['operation']
+ self.alph = parameters['alph']
+ self.specialsignals = parameters['special_signals']
+ self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
+ self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(self.alph)
+ self.batchsize = parameters['batchsize']
+ self.nb_hiddenunits = parameters['nb_hiddenunits']
+ self.onehotencoder = OneHotEncoder()
+ self.onehotencoder.fit(np.arange(len(self.dictionary)).reshape((len(self.dictionary), 1)))
+ self.model = kerasio.load_model(prefix)
+ self.trained = True
+
+ def savemodel(self, prefix):
+ """ Save the model.
+
+ :param prefix: prefix of the model path
+ :return: None
+ :type prefix: str
+ """
+ if not self.trained:
+ raise ce.ModelNotTrainedException()
+ kerasio.save_model(prefix, self.model)
+ self.dictionary.save(prefix+'_vocabs.gensimdict')
+ parameters = {'alph': self.alph, 'special_signals': self.specialsignals, 'operation': self.operation,
+ 'batchsize': self.batchsize, 'nb_hiddenunits': self.nb_hiddenunits}
+ json.dump(parameters, open(prefix+'_config.json', 'w'))
+
+
+def loadSCRNNSpellCorrector(filepath, compact=True):
+ """ Load a pre-trained scRNN spell corrector instance.
+
+ :param filepath: path of the model if compact==True; prefix of the model path if compact==False
+ :param compact: whether model file is compact (Default: True)
+ :return: an instance of the scRNN spell corrector
+ :type filepath: str
+ :type compact: bool
+ :rtype: SCRNNSpellCorrector
+ """
+ corrector = SCRNNSpellCorrector('JUMBLE-WHOLE')
+ if compact:
+ corrector.load_compact_model(filepath)
+ else:
+ corrector.loadmodel(filepath)
+ return corrector
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..aa08e578
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,85 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "shorttext"
+version = "3.0.1"
+authors = [
+ {name = "Kwan Yuet Stephen Ho", email = "stephenhky@yahoo.com.hk"}
+]
+description = "Short Text Mining"
+readme = {file = "README.md", content-type = "text/markdown"}
+license = {text = "MIT"}
+keywords = ["shorttext", "natural language processing", "text mining"]
+requires-python = ">=3.9"
+classifiers = [
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ "Topic :: Scientific/Engineering :: Mathematics",
+ "Topic :: Text Processing :: Linguistic",
+ "Topic :: Software Development :: Libraries :: Python Modules",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Natural Language :: English",
+ "License :: OSI Approved :: MIT License",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Education",
+ "Intended Audience :: Information Technology",
+ "Intended Audience :: Science/Research"
+]
+dependencies = [
+ "numpy>=1.23.3",
+ "scipy>=1.12.0",
+ "joblib>=1.3.0",
+ "scikit-learn>=1.2.0",
+ "tensorflow>=2.13.0",
+ "keras>=2.13.0",
+ "gensim>=4.0.0",
+ "pandas>=1.2.0",
+ "snowballstemmer>=3.0.0",
+ "numba>=0.57.0",
+ "deprecation>=2.0.0",
+ "npdict>=0.0.5",
+ "nptyping>=2.0.0"
+]
+
+[project.urls]
+Repository = "https://github.com/stephenhky/PyShortTextCategorization"
+Issues = "https://github.com/stephenhky/PyShortTextCategorization/issues"
+Documentation = "https://shorttext.readthedocs.io"
+
+[tool.setuptools]
+packages = [
+ "shorttext",
+ "shorttext.cli",
+ "shorttext.utils",
+ "shorttext.classifiers",
+ "shorttext.classifiers.embed",
+ "shorttext.classifiers.embed.nnlib",
+ "shorttext.classifiers.embed.sumvec",
+ "shorttext.classifiers.bow",
+ "shorttext.classifiers.bow.topic",
+ "shorttext.classifiers.bow.maxent",
+ "shorttext.data",
+ "shorttext.stack",
+ "shorttext.generators",
+ "shorttext.generators.bow",
+ "shorttext.generators.charbase",
+ "shorttext.generators.seq2seq",
+ "shorttext.metrics",
+ "shorttext.metrics.dynprog",
+ "shorttext.metrics.wasserstein",
+ "shorttext.metrics.embedfuzzy",
+ "shorttext.spell"
+]
+zip-safe = false
+package-dir = {"" = "src"}
+
+[project.scripts]
+ShortTextCategorizerConsole = "shorttext.cli.categorization:main"
+ShortTextWordEmbedSimilarity = "shorttext.cli.wordembedsim:main"
+
+[project.optional-dependencies]
+test = ["unittest2", "pytest"]
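+
+# Illustrative commands this configuration supports (standard pip / PEP 517 usage,
+# not defined in this file):
+#   pip install .           # build via setuptools.build_meta and install
+#   pip install ".[test]"   # additionally install the optional test dependencies
+#   python -m build         # produce the sdist and wheel in dist/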
diff --git a/readthedocs/Makefile b/readthedocs/Makefile
deleted file mode 100644
index 109df405..00000000
--- a/readthedocs/Makefile
+++ /dev/null
@@ -1,192 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS =
-SPHINXBUILD = sphinx-build
-PAPER =
-BUILDDIR = build
-
-# User-friendly check for sphinx-build
-ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
-$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
-endif
-
-# Internal variables.
-PAPEROPT_a4 = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
- @echo " html to make standalone HTML files"
- @echo " dirhtml to make HTML files named index.html in directories"
- @echo " singlehtml to make a single large HTML file"
- @echo " pickle to make pickle files"
- @echo " json to make JSON files"
- @echo " htmlhelp to make HTML files and a HTML help project"
- @echo " qthelp to make HTML files and a qthelp project"
- @echo " applehelp to make an Apple Help Book"
- @echo " devhelp to make HTML files and a Devhelp project"
- @echo " epub to make an epub"
- @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
- @echo " latexpdf to make LaTeX files and run them through pdflatex"
- @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
- @echo " text to make text files"
- @echo " man to make manual pages"
- @echo " texinfo to make Texinfo files"
- @echo " info to make Texinfo files and run them through makeinfo"
- @echo " gettext to make PO message catalogs"
- @echo " changes to make an overview of all changed/added/deprecated items"
- @echo " xml to make Docutils-native XML files"
- @echo " pseudoxml to make pseudoxml-XML files for display purposes"
- @echo " linkcheck to check all external links for integrity"
- @echo " doctest to run all doctests embedded in the documentation (if enabled)"
- @echo " coverage to run coverage check of the documentation (if enabled)"
-
-clean:
- rm -rf $(BUILDDIR)/*
-
-html:
- $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
- @echo
- @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
- $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
- @echo
- @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
- $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
- @echo
- @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
- $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
- @echo
- @echo "Build finished; now you can process the pickle files."
-
-json:
- $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
- @echo
- @echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
- $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
- @echo
- @echo "Build finished; now you can run HTML Help Workshop with the" \
- ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
- $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
- @echo
- @echo "Build finished; now you can run "qcollectiongenerator" with the" \
- ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
- @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/shorttext.qhcp"
- @echo "To view the help file:"
- @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/shorttext.qhc"
-
-applehelp:
- $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
- @echo
- @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
- @echo "N.B. You won't be able to view it unless you put it in" \
- "~/Library/Documentation/Help or install it in your application" \
- "bundle."
-
-devhelp:
- $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
- @echo
- @echo "Build finished."
- @echo "To view the help file:"
- @echo "# mkdir -p $$HOME/.local/share/devhelp/shorttext"
- @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/shorttext"
- @echo "# devhelp"
-
-epub:
- $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
- @echo
- @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
-
-latex:
- $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
- @echo
- @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
- @echo "Run \`make' in that directory to run these through (pdf)latex" \
- "(use \`make latexpdf' here to do that automatically)."
-
-latexpdf:
- $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
- @echo "Running LaTeX files through pdflatex..."
- $(MAKE) -C $(BUILDDIR)/latex all-pdf
- @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-latexpdfja:
- $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
- @echo "Running LaTeX files through platex and dvipdfmx..."
- $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
- @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-text:
- $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
- @echo
- @echo "Build finished. The text files are in $(BUILDDIR)/text."
-
-man:
- $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
- @echo
- @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
-
-texinfo:
- $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
- @echo
- @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
- @echo "Run \`make' in that directory to run these through makeinfo" \
- "(use \`make info' here to do that automatically)."
-
-info:
- $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
- @echo "Running Texinfo files through makeinfo..."
- make -C $(BUILDDIR)/texinfo info
- @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
-
-gettext:
- $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
- @echo
- @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
-
-changes:
- $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
- @echo
- @echo "The overview file is in $(BUILDDIR)/changes."
-
-linkcheck:
- $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
- @echo
- @echo "Link check complete; look for any errors in the above output " \
- "or in $(BUILDDIR)/linkcheck/output.txt."
-
-doctest:
- $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
- @echo "Testing of doctests in the sources finished, look at the " \
- "results in $(BUILDDIR)/doctest/output.txt."
-
-coverage:
- $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
- @echo "Testing of coverage in the sources finished, look at the " \
- "results in $(BUILDDIR)/coverage/python.txt."
-
-xml:
- $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
- @echo
- @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
-
-pseudoxml:
- $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
- @echo
- @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/readthedocs/source/conf.py b/readthedocs/source/conf.py
deleted file mode 100644
index 001d7f0c..00000000
--- a/readthedocs/source/conf.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# shorttext documentation build configuration file, created by
-# sphinx-quickstart on Sun Dec 11 16:15:57 2016.
-#
-# This file is execfile()d with the current directory set to its
-# containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-import sys
-import os
-import shlex
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
-
-# -- General configuration ------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.mathjax',
-]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-# source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
-
-# The encoding of source files.
-#source_encoding = 'utf-8-sig'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = u'shorttext'
-copyright = u'2017, Kwan-Yuet Ho'
-author = u'Kwan-Yuet Ho'
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-version = '0.3'
-# The full version, including alpha/beta/rc tags.
-release = '0.3.8'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-#today = ''
-# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-exclude_patterns = []
-
-# The reST default role (used for this markup: `text`) to use for all
-# documents.
-#default_role = None
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
-
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-#add_module_names = True
-
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-#show_authors = False
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
-
-# If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False
-
-
-# -- Options for HTML output ----------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-html_theme = 'alabaster'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further. For a list of options available for each theme, see the
-# documentation.
-#html_theme_options = {}
-
-# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
-
-# The name for this set of Sphinx documents. If None, it defaults to
-# " v documentation".
-#html_title = None
-
-# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
-
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-#html_logo = None
-
-# The name of an image file (within the static path) to use as favicon of the
-# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-#html_favicon = None
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-# Add any extra paths that contain custom files (such as robots.txt or
-# .htaccess) here, relative to this directory. These files are copied
-# directly to the root of the documentation.
-#html_extra_path = []
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
-
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-#html_use_smartypants = True
-
-# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
-
-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-#html_additional_pages = {}
-
-# If false, no module index is generated.
-#html_domain_indices = True
-
-# If false, no index is generated.
-#html_use_index = True
-
-# If true, the index is split into individual pages for each letter.
-#html_split_index = False
-
-# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
-
-# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
-
-# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
-
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it. The value of this option must be the
-# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
-
-# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
-
-# Language to be used for generating the HTML full-text search index.
-# Sphinx supports the following languages:
-# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
-# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
-#html_search_language = 'en'
-
-# A dictionary with options for the search language support, empty by default.
-# Now only 'ja' uses this config value
-#html_search_options = {'type': 'default'}
-
-# The name of a javascript file (relative to the configuration directory) that
-# implements a search results scorer. If empty, the default will be used.
-#html_search_scorer = 'scorer.js'
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'shorttextdoc'
-
-# -- Options for LaTeX output ---------------------------------------------
-
-latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
-
-# Latex figure (float) alignment
-#'figure_align': 'htbp',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-# author, documentclass [howto, manual, or own class]).
-latex_documents = [
- (master_doc, 'shorttext.tex', u'shorttext Documentation',
- u'Kwan-Yuet Ho', 'manual'),
-]
-
-# The name of an image file (relative to this directory) to place at the top of
-# the title page.
-#latex_logo = None
-
-# For "manual" documents, if this is true, then toplevel headings are parts,
-# not chapters.
-#latex_use_parts = False
-
-# If true, show page references after internal links.
-#latex_show_pagerefs = False
-
-# If true, show URL addresses after external links.
-#latex_show_urls = False
-
-# Documents to append as an appendix to all manuals.
-#latex_appendices = []
-
-# If false, no module index is generated.
-#latex_domain_indices = True
-
-
-# -- Options for manual page output ---------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
- (master_doc, 'shorttext', u'shorttext Documentation',
- [author], 1)
-]
-
-# If true, show URL addresses after external links.
-#man_show_urls = False
-
-
-# -- Options for Texinfo output -------------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-# dir menu entry, description, category)
-texinfo_documents = [
- (master_doc, 'shorttext', u'shorttext Documentation',
- author, 'shorttext', 'One line description of project.',
- 'Miscellaneous'),
-]
-
-# Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
-
-# If false, no module index is generated.
-#texinfo_domain_indices = True
-
-# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
-
-# If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
diff --git a/readthedocs/source/doclink.rst b/readthedocs/source/doclink.rst
deleted file mode 100644
index 273c08cc..00000000
--- a/readthedocs/source/doclink.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Documentation
-=============
-
-Go to: PythonHosted_
-
-.. _PythonHosted: http://pythonhosted.org/shorttext/
\ No newline at end of file
diff --git a/readthedocs/source/index.rst b/readthedocs/source/index.rst
deleted file mode 100644
index 2e53fe08..00000000
--- a/readthedocs/source/index.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-.. shorttext documentation master file, created by
- sphinx-quickstart on Sun Dec 11 16:15:57 2016.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Welcome to shorttext's documentation!
-=====================================
-
-This repository is a collection of algorithms for multi-class classification to short texts using Python.
-Modules are backward compatible unless otherwise specified. Feel free to give suggestions or report
-issues through the Github_ page.
-
-Contents:
-
-.. toctree::
- :maxdepth: 2
-
- install
- doclink
-
-Links:
-
-- Github_ : repository of the package
-- PythonHosted_ : documentation and tutorial of the package
-- PyPI_ : PyPI
-
-
-.. _Github: https://github.com/stephenhky/PyShortTextCategorization
-.. _PythonHosted: http://pythonhosted.org/shorttext/
-.. _PyPI: https://pypi.python.org/pypi/shorttext
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-
diff --git a/readthedocs/source/install.rst b/readthedocs/source/install.rst
deleted file mode 100644
index 782522c9..00000000
--- a/readthedocs/source/install.rst
+++ /dev/null
@@ -1,46 +0,0 @@
-Installation Guide
-==================
-
-To install the package in Linux or OS X, enter the following in the console:
-
-::
-
- pip install -U shorttext
-
-It is very possible that you have to do it as root, that you have to add ``sudo`` in
-front of the command.
-
-However, the repository on Python Package Index is not always the most updated. To get
-the most updated (not official) version, you can install from Github_:
-
-::
-
- pip install -U git+https://github.com/stephenhky/PyShortTextCategorization@master
-
-By adding ``-U`` in the command, it automatically installs the required packages. If not,
-you have to install these packages on your own.
-
-.. _Github: https://github.com/stephenhky/PyShortTextCategorization
-
-Required Packages
------------------
-
-- Numpy_ (Numerical Python)
-- SciPy_ (Scientific Python)
-- Scikit-Learn_ (Machine Learning in Python)
-- Theano_ (Symbolic Computing for Deep Learning)
-- keras_ (Deep Learning Library for Theano and Tensorflow)
-- gensim_ (Topic Modeling for Humans)
-- Pandas_ (Python Data Analysis Library)
-- spaCy_ (Industrial Strength Natural Language Processing in Python)
-- stemming_ (stemming in Python)
-
-.. _Numpy: http://www.numpy.org/
-.. _SciPy: https://www.scipy.org/
-.. _Scikit-Learn: http://scikit-learn.org/stable/
-.. _Theano: http://deeplearning.net/software/theano/
-.. _keras: https://keras.io/
-.. _gensim: https://radimrehurek.com/gensim/
-.. _Pandas: http://pandas.pydata.org/
-.. _spaCy: https://spacy.io/
-.. _stemming: https://pypi.python.org/pypi/stemming/
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 356e5b07..00000000
--- a/setup.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from setuptools import setup
-
-def readme():
- with open('README.md') as f:
- return f.read()
-
-setup(name='shorttext',
- version="0.4.0",
- description="Short Text Categorization",
- long_description="Supervised learning algorithms for short text categorization using embedded word vectors such as Word2Vec, or immediate feature vectors using topic models",
- classifiers=[
- "Topic :: Scientific/Engineering :: Artificial Intelligence",
- "Natural Language :: English",
- "Topic :: Scientific/Engineering :: Mathematics",
- "Programming Language :: Python :: 2.7",
- "License :: OSI Approved :: MIT License",
- ],
- keywords="short text natural language processing text mining",
- url="https://github.com/stephenhky/PyShortTextCategorization",
- author="Kwan-Yuet Ho",
- author_email="stephenhky@yahoo.com.hk",
- license='MIT',
- packages=['shorttext',
- 'shorttext.utils',
- 'shorttext.classifiers',
- 'shorttext.classifiers.embed',
- 'shorttext.classifiers.embed.nnlib',
- 'shorttext.classifiers.embed.sumvec',
- 'shorttext.classifiers.bow',
- 'shorttext.classifiers.bow.topic',
- 'shorttext.classifiers.bow.maxent',
- 'shorttext.data',
- 'shorttext.stack',
- 'shorttext.generators',
- 'shorttext.generators.bow'],
- package_dir={'shorttext': 'shorttext'},
- package_data={'shorttext': ['data/*.csv', 'utils/*.pkl']},
- setup_requires=['numpy'],
- install_requires=[
- 'numpy', 'scipy', 'scikit-learn', 'keras>=2.0.0', 'gensim>=2.2.0', 'pandas', 'spacy', 'stemming',
- ],
- scripts=['bin/ShortTextCategorizerConsole',
- 'bin/ShortTextWord2VecSimilarity',
- 'bin/switch_kerasbackend'],
- # include_package_data=False,
- zip_safe=False)
diff --git a/shorttext/__init__.py b/shorttext/__init__.py
deleted file mode 100644
index 160306cc..00000000
--- a/shorttext/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import os
-import sys
-
-thisdir, _ = os.path.split(__file__)
-sys.path.append(thisdir)
-
-from . import utils
-from . import data
-from . import classifiers
-from . import generators
-from . import stack
-from .smartload import smartload_compact_model
\ No newline at end of file
diff --git a/shorttext/classifiers/bow/topic/LatentTopicModeling.py b/shorttext/classifiers/bow/topic/LatentTopicModeling.py
deleted file mode 100644
index fc109bc5..00000000
--- a/shorttext/classifiers/bow/topic/LatentTopicModeling.py
+++ /dev/null
@@ -1,12 +0,0 @@
-
-# for backward compatibility
-
-from shorttext.generators.bow.GensimTopicModeling import gensim_topic_model_dict
-from shorttext.generators.bow.LatentTopicModeling import LatentTopicModeler
-from shorttext.generators.bow.GensimTopicModeling import GensimTopicModeler
-from shorttext.generators.bow.GensimTopicModeling import LDAModeler
-from shorttext.generators.bow.GensimTopicModeling import LSIModeler
-from shorttext.generators.bow.GensimTopicModeling import RPModeler
-from shorttext.generators.bow.AutoEncodingTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel
-from shorttext.generators import load_gensimtopicmodel
-from shorttext.generators import load_autoencoder_topicmodel as load_autoencoder_topic
diff --git a/shorttext/data/__init__.py b/shorttext/data/__init__.py
deleted file mode 100644
index 58cf1524..00000000
--- a/shorttext/data/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .data_retrieval import subjectkeywords, nihreports, inaugual, retrieve_jsondata_as_dict, retrieve_csvdata_as_dict, yield_crossvalidation_classdicts
diff --git a/shorttext/generators/__init__.py b/shorttext/generators/__init__.py
deleted file mode 100644
index 9cd8fbe8..00000000
--- a/shorttext/generators/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .bow.GensimTopicModeling import load_gensimtopicmodel
-from .bow.AutoEncodingTopicModeling import load_autoencoder_topicmodel
-
-from .bow.GensimTopicModeling import GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
-from .bow.AutoEncodingTopicModeling import AutoencodingTopicModeler
\ No newline at end of file
diff --git a/shorttext/utils/__init__.py b/shorttext/utils/__init__.py
deleted file mode 100644
index 8729b124..00000000
--- a/shorttext/utils/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from . import kerasmodel_io
-from . import classification_exceptions
-from . import gensim_corpora
-from . import textpreprocessing
-from .wordembed import load_word2vec_model
-from . import compactmodel_io
-
-from .textpreprocessing import spacy_tokenize as tokenize
-from .textpreprocessing import text_preprocessor, standard_text_preprocessor_1
-
-from .deprecation import deprecated
diff --git a/shorttext/utils/deprecation.py b/shorttext/utils/deprecation.py
deleted file mode 100644
index 85516c79..00000000
--- a/shorttext/utils/deprecation.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import warnings
-
-def deprecated(func):
- """This is a decorator which can be used to mark functions
-    as deprecated. It will result in a warning being emitted
- when the function is used."""
- def newFunc(*args, **kwargs):
- warnings.warn("Call to deprecated function %s." % func.__name__,
- category=DeprecationWarning)
- return func(*args, **kwargs)
- newFunc.__name__ = func.__name__
- newFunc.__doc__ = func.__doc__
- newFunc.__dict__.update(func.__dict__)
- return newFunc
\ No newline at end of file
diff --git a/shorttext/utils/stopwordset.pkl b/shorttext/utils/stopwordset.pkl
deleted file mode 100644
index cfde1e9a..00000000
--- a/shorttext/utils/stopwordset.pkl
+++ /dev/null
@@ -1,4201 +0,0 @@
-c__builtin__
-set
-p0
-((lp1
[... remainder of the deleted pickle file omitted: roughly 4,200 further lines of pickle protocol 0 text encoding a Python set of multilingual stopwords ...]
-p2059
-aVho
-p2060
-aVha
-p2061
-aVhe
-p2062
-aVezrt
-p2063
-aVnha
-p2064
-aVdagli
-p2065
-aVhouvramos
-p2066
-aVj
-p2067
-aVbelow
-p2068
-aVfueseis
-p2069
-aVsein
-p2070
-aVhvad
-p2071
-aVvuestros
-p2072
-aVamelyek
-p2073
-aVins
-p2074
-aVvostri
-p2075
-aVamelyet
-p2076
-aVvostro
-p2077
-aVind
-p2078
-aVvostra
-p2079
-aVvostre
-p2080
-aVarra
-p2081
-aVtendrn
-p2082
-aVdeires
-p2083
-aVsiano
-p2084
-aVother
-p2085
-aVseus
-p2086
-aVsich
-p2087
-aV\u0435\u0449\u0435
-p2088
-aVsarete
-p2089
-aVogs
-p2090
-aVutn
-p2091
-aVagli
-p2092
-aVhubieron
-p2093
-aV\u043f\u043e\u0442\u043e\u043c
-p2094
-aVh
-p2095
-aVstai
-p2096
-aVvuestras
-p2097
-aVmist
-p2098
-atp2099
-Rp2100
-.
\ No newline at end of file
diff --git a/shorttext/utils/textpreprocessing.py b/shorttext/utils/textpreprocessing.py
deleted file mode 100644
index 6699001a..00000000
--- a/shorttext/utils/textpreprocessing.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import re
-import pickle
-import os
-
-import spacy
-from stemming.porter import stem
-
-# load stop words
-this_dir, _ = os.path.split(__file__)
-stopwordset = pickle.load(open(os.path.join(this_dir, 'stopwordset.pkl'), 'r'))
-
-# initialize spacy
-class SpaCyNLPHolder:
- def __init__(self):
- self.nlp = None
-
- def getNLPInstance(self):
- if self.nlp==None:
- self.nlp = spacy.load('en')
- return self.nlp
-# prepare the singleton
-spaCyNLPHolder = SpaCyNLPHolder()
-
-def spacy_tokenize(text):
- """ Tokenize a sentence with spaCy.
-
- This works like `nltk.tokenize` which tokenize a sentence, but this runs faster.
- This returns the strings of tokens.
-
- :param text: sentence to tokenize
- :return: list of tokens
- :type text: str
- :rtype: list
- """
- nlp = spaCyNLPHolder.getNLPInstance() # lazy loading
- tokenizer = nlp(unicode(text))
- return map(str, [token for token in tokenizer])
-
-def preprocess_text(text, pipeline):
- """ Preprocess the text according to the given pipeline.
-
- Given the pipeline, which is a list of functions that process an
- input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.),
- preprocess the text.
-
- :param text: text to be preprocessed
- :param pipeline: a list of functions that convert a text to another text
- :return: preprocessed text
- :type text: str
- :type pipeline: list
- :rtype: str
- """
- if len(pipeline)==0:
- return text
- else:
- return preprocess_text(pipeline[0](text), pipeline[1:])
-
-def text_preprocessor(pipeline):
- """ Return the function that preprocesses text according to the pipeline.
-
- Given the pipeline, which is a list of functions that process an
- input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.),
- return a function that preprocesses an input text outlined by the pipeline, essentially
- a function that runs :func:`~preprocess_text` with the specified pipeline.
-
- :param pipeline: a list of functions that convert a text to another text
- :return: a function that preprocesses text according to the pipeline
- :type pipeline: list
- :rtype: function
- """
- return lambda text: preprocess_text(text, pipeline)
-
-def standard_text_preprocessor_1():
- """ Return a commonly used text preprocessor.
-
- Return a text preprocessor that is commonly used, with the following steps:
-
- - removing special characters,
- - removing numerals,
- - converting all alphabets to lower cases,
- - removing stop words, and
- - stemming the words (using Porter stemmer).
-
- This function calls :func:`~text_preprocessor`.
-
- :return: a function that preprocesses text according to the pipeline
- :rtype: function
- """
- pipeline = [lambda s: re.sub('[^\w\s]', '', s),
- lambda s: re.sub('[\d]', '', s),
- lambda s: s.lower(),
- lambda s: ' '.join(filter(lambda s: not (s in stopwordset), spacy_tokenize(s))),
- lambda s: ' '.join(map(stem, spacy_tokenize(s)))
- ]
- return text_preprocessor(pipeline)
\ No newline at end of file
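
The module removed above is re-created under src/shorttext/utils/ elsewhere in this changeset (later hunks import it as ....utils.textpreprocessing). Its core idea is that a preprocessing pipeline is simply a list of str-to-str functions applied in order; a minimal self-contained sketch of that idea, without the spaCy and stemming dependencies:

import re

def preprocess_text(text, pipeline):
    # apply each step in turn; an empty pipeline leaves the text unchanged
    for step in pipeline:
        text = step(text)
    return text

pipeline = [
    lambda s: re.sub(r'[^\w\s]', '', s),  # remove special characters
    lambda s: re.sub(r'\d', '', s),       # remove numerals
    lambda s: s.lower(),                  # lower-case everything
]

print(preprocess_text('Hello, World 2024!', pipeline))  # hello world (plus leftover whitespace)
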
diff --git a/shorttext/utils/wordembed.py b/shorttext/utils/wordembed.py
deleted file mode 100644
index 6b865814..00000000
--- a/shorttext/utils/wordembed.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import gensim
-
-def load_word2vec_model(path, binary=True):
- """ Load a pre-trained Word2Vec model.
-
- :param path: path of the file of the pre-trained Word2Vec model
- :param binary: whether the file is in binary format (Default: True)
- :return: a pre-trained Word2Vec model
- :type path: str
- :type binary: bool
- :rtype: gensim.models.keyedvectors.KeyedVectors
- """
- return gensim.models.KeyedVectors.load_word2vec_format(path, binary=binary)
-
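
The deleted wordembed.py above was a one-line wrapper over gensim, and a replacement presumably lives under src/shorttext/utils/. For reference, the equivalent direct gensim call looks like this (the model path is a placeholder, not shipped with the package):

import gensim

wvmodel = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',  # placeholder path to a pre-trained model
    binary=True)
print(wvmodel.vector_size)  # 300 for the GoogleNews vectors
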
diff --git a/shorttext_tests.py b/shorttext_tests.py
deleted file mode 100644
index a8601ae8..00000000
--- a/shorttext_tests.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import unittest
-
-class SampleTest(unittest.TestCase):
- def setUp(self):
- self.sample_var = True
-
- def testSampleTestCase(self):
- self.assertEqual(True, self.sample_var)
-
-if __name__ == '__main__':
- unittest.main()
\ No newline at end of file
diff --git a/src/shorttext/__init__.py b/src/shorttext/__init__.py
new file mode 100644
index 00000000..331cc8b3
--- /dev/null
+++ b/src/shorttext/__init__.py
@@ -0,0 +1,8 @@
+
+from . import metrics
+from . import classifiers
+from . import data
+from . import generators
+from . import spell
+from . import stack
+from . import utils
diff --git a/shorttext/classifiers/__init__.py b/src/shorttext/classifiers/__init__.py
similarity index 80%
rename from shorttext/classifiers/__init__.py
rename to src/shorttext/classifiers/__init__.py
index 653f8a05..00ba75be 100644
--- a/shorttext/classifiers/__init__.py
+++ b/src/shorttext/classifiers/__init__.py
@@ -4,10 +4,6 @@
from .embed import frameworks
from .embed.sumvec import frameworks as sumvecframeworks
-from .bow.topic.LatentTopicModeling import GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
-from .bow.topic.LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topic
-from .bow.topic.LatentTopicModeling import load_gensimtopicmodel
-
from .bow.topic.TopicVectorDistanceClassification import TopicVecCosineDistanceClassifier as TopicVectorCosineDistanceClassifier
from .bow.topic.TopicVectorDistanceClassification import train_autoencoder_cosineClassifier, train_gensimtopicvec_cosineClassifier
from .bow.topic.TopicVectorDistanceClassification import load_autoencoder_cosineClassifier, load_gensimtopicvec_cosineClassifier
diff --git a/shorttext/classifiers/bow/__init__.py b/src/shorttext/classifiers/bow/__init__.py
similarity index 100%
rename from shorttext/classifiers/bow/__init__.py
rename to src/shorttext/classifiers/bow/__init__.py
diff --git a/shorttext/classifiers/bow/maxent/MaxEntClassification.py b/src/shorttext/classifiers/bow/maxent/MaxEntClassification.py
similarity index 85%
rename from shorttext/classifiers/bow/maxent/MaxEntClassification.py
rename to src/shorttext/classifiers/bow/maxent/MaxEntClassification.py
index dcbab243..1db45fd2 100644
--- a/shorttext/classifiers/bow/maxent/MaxEntClassification.py
+++ b/src/shorttext/classifiers/bow/maxent/MaxEntClassification.py
@@ -3,16 +3,15 @@
from scipy.sparse import dok_matrix
from gensim.corpora import Dictionary
-from keras.models import Sequential
-from keras.layers import Dense
-from keras.regularizers import l2
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.regularizers import l2
-import shorttext.utils.kerasmodel_io as kerasio
-from shorttext.utils import tokenize
-from shorttext.utils import gensim_corpora as gc
-from shorttext.utils import classification_exceptions as e
-import shorttext.utils.compactmodel_io as cio
-from shorttext.utils import deprecated
+from ....utils import kerasmodel_io as kerasio
+from ....utils import tokenize
+from ....utils import gensim_corpora as gc
+from ....utils import classification_exceptions as e
+from ....utils.compactmodel_io import CompactIOMachine
def logistic_framework(nb_features, nb_outputs, l2reg=0.01, bias_l2reg=0.01, optimizer='adam'):
@@ -45,8 +44,7 @@ def logistic_framework(nb_features, nb_outputs, l2reg=0.01, bias_l2reg=0.01, opt
return kmodel
-@cio.compactio({'classifier': 'maxent'}, 'maxent', ['_classlabels.txt', '.json', '.h5', '_labelidx.pkl', '_dictionary.dict'])
-class MaxEntClassifier:
+class MaxEntClassifier(CompactIOMachine):
"""
This is a classifier that implements the principle of maximum entropy.
@@ -59,6 +57,10 @@ def __init__(self, preprocessor=lambda s: s.lower()):
:param preprocessor: text preprocessor
:type preprocessor: function
"""
+ CompactIOMachine.__init__(self,
+ {'classifier': 'maxent'},
+ 'maxent',
+ ['_classlabels.txt', '.json', '.weights.h5', '_labelidx.pkl', '_dictionary.dict'])
self.preprocessor = preprocessor
self.trained = False
@@ -86,22 +88,6 @@ def shorttext_to_vec(self, shorttext):
return vec[0, :]
- @deprecated
- def gensimcorpus_to_matrix(self, corpus):
- """ Convert the gensim corpus into a sparse matrix. (deprecated)
-
- :param corpus: gensim corpus
- :return: matrix representing the corpus
- :type corpus: list
- :rtype: scipy.sparse.dok_matrix
- """
- # not used, deprecated
- matrix = dok_matrix((len(corpus), len(self.dictionary)))
- for docid, doc in enumerate(corpus):
- for tokenid, count in doc:
- matrix[docid, tokenid] = count
- return matrix
-
def index_classlabels(self):
""" Index the class outcome labels.
@@ -118,7 +104,7 @@ def convert_classdict_to_XY(self, classdict):
:type classdict: dict
:rtype: tuple
"""
- nb_data = sum(map(lambda k: len(classdict[k]), classdict.keys()))
+ nb_data = sum([len(classdict[k]) for k in classdict])
X = dok_matrix((nb_data, len(self.dictionary)))
y = dok_matrix((nb_data, len(self.labels2idx)))
@@ -127,7 +113,6 @@ def convert_classdict_to_XY(self, classdict):
if label in self.labels2idx.keys():
for shorttext in classdict[label]:
tokens = tokenize(self.preprocessor(shorttext))
- #X[rowid, :] = self.shorttext_to_vec(shorttext)
for token in tokens:
X[rowid, self.dictionary.token2id[token]] += 1.0
y[rowid, self.labels2idx[label]] = 1.
@@ -173,7 +158,7 @@ def savemodel(self, nameprefix):
Given the prefix of the file paths, save the model into files, with name given by the prefix.
There will be five files produced, one name ending with "_classlabels.txt", one with ".json",
- one with ".h5", one with "_labelidx.pkl", and one with "_dictionary.dict".
+ one with ".weights.h5", one with "_labelidx.pkl", and one with "_dictionary.dict".
If there is no trained model, a `ModelNotTrainedException` will be thrown.
@@ -193,13 +178,13 @@ def savemodel(self, nameprefix):
labelfile.write('\n'.join(self.classlabels))
labelfile.close()
- pickle.dump(self.labels2idx, open(nameprefix+'_labelidx.pkl', 'w'))
+ pickle.dump(self.labels2idx, open(nameprefix+'_labelidx.pkl', 'wb'))
def loadmodel(self, nameprefix):
""" Load a trained model from files.
Given the prefix of the file paths, load the model from files with name given by the prefix
- followed by "_classlabels.txt", ".json", ".h5", "_labelidx.pkl", and "_dictionary.dict".
+ followed by "_classlabels.txt", ".json", ".weights.h5", "_labelidx.pkl", and "_dictionary.dict".
If this has not been run, or a model was not trained by :func:`~train`,
a `ModelNotTrainedException` will be raised while performing prediction or saving the model.
@@ -213,11 +198,10 @@ def loadmodel(self, nameprefix):
self.dictionary = Dictionary.load(nameprefix+'_dictionary.dict')
labelfile = open(nameprefix+'_classlabels.txt', 'r')
- self.classlabels = labelfile.readlines()
+ self.classlabels = [s.strip() for s in labelfile.readlines()]
labelfile.close()
- self.classlabels = map(lambda s: s.strip(), self.classlabels)
- self.labels2idx = pickle.load(open(nameprefix+'_labelidx.pkl', 'r'))
+ self.labels2idx = pickle.load(open(nameprefix+'_labelidx.pkl', 'rb'))
self.trained = True
@@ -245,6 +229,7 @@ def score(self, shorttext):
scoredict = {classlabel: predictions[0][idx] for idx, classlabel in enumerate(self.classlabels)}
return scoredict
+
def load_maxent_classifier(name, compact=True):
""" Load the maximum entropy classifier from saved model.
diff --git a/shorttext/classifiers/bow/maxent/__init__.py b/src/shorttext/classifiers/bow/maxent/__init__.py
similarity index 100%
rename from shorttext/classifiers/bow/maxent/__init__.py
rename to src/shorttext/classifiers/bow/maxent/__init__.py
diff --git a/shorttext/classifiers/bow/topic/SkLearnClassification.py b/src/shorttext/classifiers/bow/topic/SkLearnClassification.py
similarity index 92%
rename from shorttext/classifiers/bow/topic/SkLearnClassification.py
rename to src/shorttext/classifiers/bow/topic/SkLearnClassification.py
index 7b66c505..a069eaba 100644
--- a/shorttext/classifiers/bow/topic/SkLearnClassification.py
+++ b/src/shorttext/classifiers/bow/topic/SkLearnClassification.py
@@ -1,13 +1,13 @@
-from collections import defaultdict
-from sklearn.externals import joblib
+import os
-from shorttext.utils import textpreprocessing as textpreprocess
-from .LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel
-from .LatentTopicModeling import LDAModeler, LSIModeler, RPModeler
-from .LatentTopicModeling import load_gensimtopicmodel
-import shorttext.utils.classification_exceptions as e
-import shorttext.utils.compactmodel_io as cio
+import joblib
+
+from ....utils import textpreprocessing as textpreprocess
+from ....generators import load_autoencoder_topicmodel, load_gensimtopicmodel
+from ....generators import LDAModeler, LSIModeler, RPModeler, AutoencodingTopicModeler
+from ....utils import classification_exceptions as e
+from ....utils import compactmodel_io as cio
class TopicVectorSkLearnClassifier:
@@ -52,9 +52,9 @@ def train(self, classdict, *args, **kwargs):
"""
X = []
y = []
- self.classlabels = classdict.keys()
+ self.classlabels = sorted(classdict.keys())  # class labels must be sorted, matching the ordering used by the topic modelers
for classidx, classlabel in zip(range(len(self.classlabels)), self.classlabels):
- topicvecs = map(self.topicmodeler.retrieve_topicvec, classdict[classlabel])
+ topicvecs = [self.topicmodeler.retrieve_topicvec(topic) for topic in classdict[classlabel]]
X += topicvecs
y += [classidx]*len(topicvecs)
self.classifier.fit(X, y, *args, **kwargs)
@@ -92,7 +92,7 @@ def classify(self, shorttext):
topicvec = self.getvector(shorttext)
return self.classlabels[self.classifier.predict([topicvec])[0]]
- def score(self, shorttext, default_score=0.0):
+ def score(self, shorttext):
""" Calculate the score, which is the cosine similarity with the topic vector of the model,
of the short text against each class label.
@@ -100,20 +100,18 @@ def score(self, shorttext, default_score=0.0):
topic model was not trained, it will raise `ModelNotTrainedException`.
:param shorttext: short text
- :param default_score: default score if no score is assigned (Default: 0.0)
:return: dictionary of scores of the text to all classes
:raise: ModelNotTrainedException
:type shorttext: str
- :type default_score: float
:rtype: dict
"""
if not self.trained:
raise e.ModelNotTrainedException()
- scoredict = defaultdict(lambda : default_score)
+
topicvec = self.getvector(shorttext)
- for classidx, classlabel in zip(range(len(self.classlabels)), self.classlabels):
- scoredict[classlabel] = self.classifier.score([topicvec], [classidx])
- return dict(scoredict)
+ scoredict = {classlabel: self.classifier.score([topicvec], [classidx])
+ for classidx, classlabel in enumerate(self.classlabels)}
+ return scoredict
def savemodel(self, nameprefix):
""" Save the model.
@@ -134,6 +132,9 @@ def savemodel(self, nameprefix):
raise e.ModelNotTrainedException()
self.topicmodeler.savemodel(nameprefix)
joblib.dump(self.classifier, nameprefix+'.pkl')
+ labelfile = open(nameprefix+'_classlabels.txt', 'w')
+ labelfile.write('\n'.join(self.classlabels))
+ labelfile.close()
def loadmodel(self, nameprefix):
""" Load the classification model together with the topic model.
@@ -144,7 +145,13 @@ def loadmodel(self, nameprefix):
"""
self.topicmodeler.loadmodel(nameprefix)
self.classifier = joblib.load(nameprefix+'.pkl')
- self.classlabels = self.topicmodeler.classlabels
+ # for backward compatibility, shorttext<1.0.0 does not have _classlabels.txt
+ if os.path.exists(nameprefix+'_classlabels.txt'):
+ labelfile = open(nameprefix+'_classlabels.txt', 'r')
+ self.classlabels = [s.strip() for s in labelfile.readlines()]
+ labelfile.close()
+ else:
+ self.classlabels = self.topicmodeler.classlabels
def save_compact_model(self, name):
""" Save the model.
@@ -160,7 +167,7 @@ def save_compact_model(self, name):
"""
topicmodel_info = self.topicmodeler.get_info()
cio.save_compact_model(name, self.savemodel, 'topic_sklearn',
- topicmodel_info['suffices']+['.pkl'],
+ topicmodel_info['suffices']+['.pkl', '_classlabels.txt'],
{'classifier': 'topic_sklearn', 'topicmodel': topicmodel_info['classifier']})
def load_compact_model(self, name):
@@ -174,6 +181,7 @@ def load_compact_model(self, name):
{'classifier': 'topic_sklearn', 'topicmodel': None})
self.trained = True
+
def train_gensim_topicvec_sklearnclassifier(classdict,
nb_topics,
sklearn_classifier,
@@ -233,6 +241,7 @@ def train_gensim_topicvec_sklearnclassifier(classdict,
return classifier
+
def load_gensim_topicvec_sklearnclassifier(name,
preprocessor=textpreprocess.standard_text_preprocessor_1(),
compact=True):
@@ -282,6 +291,7 @@ def load_gensim_topicvec_sklearnclassifier(name,
# return the instance
return classifier
+
def train_autoencoder_topic_sklearnclassifier(classdict,
nb_topics,
sklearn_classifier,
@@ -330,6 +340,7 @@ def train_autoencoder_topic_sklearnclassifier(classdict,
return classifier
+
def load_autoencoder_topic_sklearnclassifier(name,
preprocessor=textpreprocess.standard_text_preprocessor_1(),
compact=True):
@@ -375,4 +386,4 @@ def load_autoencoder_topic_sklearnclassifier(name,
classifier.trained = True
# return the instance
- return classifier
\ No newline at end of file
+ return classifier
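
loadmodel() above now persists the sorted class labels in a "_classlabels.txt" side file and falls back to the topic modeler's labels when that file is absent (models saved with shorttext<1.0.0). A standalone sketch of this fallback pattern:

import os

def load_classlabels(nameprefix, fallback_labels):
    """Read labels from <nameprefix>_classlabels.txt if present; otherwise use the fallback."""
    labelpath = nameprefix + '_classlabels.txt'
    if os.path.exists(labelpath):
        with open(labelpath, 'r') as labelfile:
            return [line.strip() for line in labelfile]
    return fallback_labels  # e.g. topicmodeler.classlabels for pre-1.0.0 models
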
diff --git a/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py b/src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py
similarity index 96%
rename from shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py
rename to src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py
index 3e47f838..bf429af4 100644
--- a/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py
+++ b/src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py
@@ -1,8 +1,7 @@
-from shorttext.utils import textpreprocessing as textpreprocess
-from .LatentTopicModeling import LatentTopicModeler, GensimTopicModeler
-from .LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel
-from .LatentTopicModeling import load_gensimtopicmodel
+from ....utils import textpreprocessing as textpreprocess
+from ....generators import LatentTopicModeler, GensimTopicModeler, AutoencodingTopicModeler
+from ....generators import load_autoencoder_topicmodel, load_gensimtopicmodel
class TopicVecCosineDistanceClassifier:
@@ -74,6 +73,7 @@ def load_compact_model(self, name):
def save_compact_model(self, name):
self.topicmodeler.save_compact_model(name)
+
def train_gensimtopicvec_cosineClassifier(classdict,
nb_topics,
preprocessor=textpreprocess.standard_text_preprocessor_1(),
@@ -111,6 +111,7 @@ def train_gensimtopicvec_cosineClassifier(classdict,
# cosine distance classifier
return TopicVecCosineDistanceClassifier(topicmodeler)
+
def load_gensimtopicvec_cosineClassifier(name,
preprocessor=textpreprocess.standard_text_preprocessor_1(),
compact=True):
@@ -134,6 +135,7 @@ def load_gensimtopicvec_cosineClassifier(name,
topicmodeler = load_gensimtopicmodel(name, preprocessor=preprocessor, compact=compact)
return TopicVecCosineDistanceClassifier(topicmodeler)
+
def train_autoencoder_cosineClassifier(classdict,
nb_topics,
preprocessor=textpreprocess.standard_text_preprocessor_1(),
@@ -162,13 +164,14 @@ def train_autoencoder_cosineClassifier(classdict,
# cosine distance classifier
return TopicVecCosineDistanceClassifier(autoencoder)
+
def load_autoencoder_cosineClassifier(name,
preprocessor=textpreprocess.standard_text_preprocessor_1(),
compact=True):
""" Load an autoencoder from files for topic modeling, and return a cosine classifier.
Given the prefix of the file paths, load the model into files, with name given by the prefix.
- There are files with names ending with "_encoder.json" and "_encoder.h5", which are
+ There are files with names ending with "_encoder.json" and "_encoder.weights.h5", which are
the JSON and HDF5 files for the encoder respectively.
They also include a gensim dictionary (.gensimdict).
@@ -182,4 +185,4 @@ def load_autoencoder_cosineClassifier(name,
:rtype: TopicVecCosineDistanceClassifier
"""
autoencoder = load_autoencoder_topicmodel(name, preprocessor=preprocessor, compact=compact)
- return TopicVecCosineDistanceClassifier(autoencoder)
\ No newline at end of file
+ return TopicVecCosineDistanceClassifier(autoencoder)
diff --git a/shorttext/classifiers/bow/topic/__init__.py b/src/shorttext/classifiers/bow/topic/__init__.py
similarity index 70%
rename from shorttext/classifiers/bow/topic/__init__.py
rename to src/shorttext/classifiers/bow/topic/__init__.py
index af52dc94..6467258f 100644
--- a/shorttext/classifiers/bow/topic/__init__.py
+++ b/src/shorttext/classifiers/bow/topic/__init__.py
@@ -1,3 +1,3 @@
-from . import LatentTopicModeling
+
from . import TopicVectorDistanceClassification
from . import SkLearnClassification
\ No newline at end of file
diff --git a/shorttext/classifiers/embed/__init__.py b/src/shorttext/classifiers/embed/__init__.py
similarity index 100%
rename from shorttext/classifiers/embed/__init__.py
rename to src/shorttext/classifiers/embed/__init__.py
diff --git a/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py b/src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py
similarity index 63%
rename from shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py
rename to src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py
index 220d85af..bf228f78 100644
--- a/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py
+++ b/src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py
@@ -1,19 +1,19 @@
import json
import os
+import warnings
import numpy as np
-from keras.preprocessing.text import Tokenizer
-from keras.preprocessing.sequence import pad_sequences
+import pandas as pd
-import shorttext.utils.kerasmodel_io as kerasio
-import shorttext.utils.classification_exceptions as e
-from shorttext.utils import tokenize
-import shorttext.utils.compactmodel_io as cio
+from ....utils import kerasmodel_io as kerasio
+from ....utils import classification_exceptions as e
+from ....utils import tokenize
+from ....utils.compactmodel_io import CompactIOMachine
+from typing import Union, List, Dict, Any
-@cio.compactio({'classifier': 'nnlibvec'}, 'nnlibvec', ['_classlabels.txt', '.json', '.h5', '_config.json'])
-class VarNNEmbeddedVecClassifier:
+class VarNNEmbeddedVecClassifier(CompactIOMachine):
"""
This is a wrapper for various neural network algorithms
for supervised short text categorization.
@@ -30,57 +30,22 @@ class VarNNEmbeddedVecClassifier:
A pre-trained Google Word2Vec model can be downloaded `here
`_.
-
- Examples
-
- >>> import shorttext
- >>> # load the Word2Vec model
- >>> wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)
- >>>
- >>> # load the training data
- >>> trainclassdict = shorttext.data.subjectkeywords()
- >>>
- >>> # initialize the classifier and train
- >>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys())) # using convolutional neural network model
- >>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)
- >>> classifier.train(trainclassdict, kmodel)
- Epoch 1/10
- 45/45 [==============================] - 0s - loss: 1.0578
- Epoch 2/10
- 45/45 [==============================] - 0s - loss: 0.5536
- Epoch 3/10
- 45/45 [==============================] - 0s - loss: 0.3437
- Epoch 4/10
- 45/45 [==============================] - 0s - loss: 0.2282
- Epoch 5/10
- 45/45 [==============================] - 0s - loss: 0.1658
- Epoch 6/10
- 45/45 [==============================] - 0s - loss: 0.1273
- Epoch 7/10
- 45/45 [==============================] - 0s - loss: 0.1052
- Epoch 8/10
- 45/45 [==============================] - 0s - loss: 0.0961
- Epoch 9/10
- 45/45 [==============================] - 0s - loss: 0.0839
- Epoch 10/10
- 45/45 [==============================] - 0s - loss: 0.0743
- >>> classifier.score('artificial intelligence')
- {'mathematics': 0.57749695, 'physics': 0.33749574, 'theology': 0.085007325}
"""
- def __init__(self, wvmodel, vecsize=100, maxlen=15, with_gensim=False):
+ def __init__(self, wvmodel, vecsize=None, maxlen=15, with_gensim=False):
""" Initialize the classifier.
:param wvmodel: Word2Vec model
- :param vecsize: length of the embedded vectors in the model (Default: 100)
+ :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model)
:param maxlen: maximum number of words in a sentence (Default: 15)
:type wvmodel: gensim.models.keyedvectors.KeyedVectors
:type vecsize: int
:type maxlen: int
"""
+ CompactIOMachine.__init__(self, {'classifier': 'nnlibvec'}, 'nnlibvec', ['_classlabels.txt', '.json', '.weights.h5', '_config.json'])
self.wvmodel = wvmodel
- self.vecsize = vecsize
+ self.vecsize = self.wvmodel.vector_size if vecsize == None else vecsize
self.maxlen = maxlen
- self.with_gensim = with_gensim
+ self.with_gensim = False if not with_gensim else with_gensim
self.trained = False
def convert_trainingdata_matrix(self, classdict):
@@ -106,20 +71,14 @@ def convert_trainingdata_matrix(self, classdict):
category_bucket = [0]*len(classlabels)
category_bucket[lblidx_dict[label]] = 1
indices.append(category_bucket)
- if self.with_gensim:
- phrases.append(shorttext)
- else:
- phrases.append(tokenize(shorttext))
-
- if self.with_gensim:
- return classlabels, phrases, indices
+ phrases.append(tokenize(shorttext))
# store embedded vectors
train_embedvec = np.zeros(shape=(len(phrases), self.maxlen, self.vecsize))
for i in range(len(phrases)):
for j in range(min(self.maxlen, len(phrases[i]))):
train_embedvec[i, j] = self.word_to_embedvec(phrases[i][j])
- indices = np.array(indices, dtype=np.int)
+ indices = np.array(indices, dtype=np.int_)
return classlabels, train_embedvec, indices
@@ -139,23 +98,11 @@ def train(self, classdict, kerasmodel, nb_epoch=10):
:type kerasmodel: keras.models.Sequential
:type nb_epoch: int
"""
- if self.with_gensim:
- # convert classdict to training input vectors
- self.classlabels, x_train, y_train = self.convert_trainingdata_matrix(classdict)
+ # convert classdict to training input vectors
+ self.classlabels, train_embedvec, indices = self.convert_trainingdata_matrix(classdict)
- tokenizer = Tokenizer()
- tokenizer.fit_on_texts(x_train)
- x_train = tokenizer.texts_to_sequences(x_train)
- x_train = pad_sequences(x_train, maxlen=self.maxlen)
-
- # train the model
- kerasmodel.fit(x_train, y_train, epochs=nb_epoch)
- else:
- # convert classdict to training input vectors
- self.classlabels, train_embedvec, indices = self.convert_trainingdata_matrix(classdict)
-
- # train the model
- kerasmodel.fit(train_embedvec, indices, epochs=nb_epoch)
+ # train the model
+ kerasmodel.fit(train_embedvec, indices, epochs=nb_epoch)
# flag switch
self.model = kerasmodel
@@ -166,7 +113,7 @@ def savemodel(self, nameprefix):
Given the prefix of the file paths, save the model into files, with name given by the prefix.
There will be three files produced, one name ending with "_classlabels.txt", one name
- ending with ".json", and one name ending with ".h5". For shorttext>=0.4.0, another file
+ ending with ".json", and one name ending with ".weights.h5". For shorttext>=0.4.0, another file
with extension "_config.json" would be created.
If there is no trained model, a `ModelNotTrainedException` will be thrown.
@@ -182,13 +129,14 @@ def savemodel(self, nameprefix):
labelfile = open(nameprefix+'_classlabels.txt', 'w')
labelfile.write('\n'.join(self.classlabels))
labelfile.close()
- json.dump({'with_gensim': self.with_gensim}, open(nameprefix+'_config.json', 'w'))
+ json.dump({'with_gensim': False, 'maxlen': self.maxlen, 'vecsize': self.vecsize},
+ open(nameprefix+'_config.json', 'w'))
def loadmodel(self, nameprefix):
""" Load a trained model from files.
Given the prefix of the file paths, load the model from files with name given by the prefix
- followed by "_classlabels.txt", ".json" and ".h5". For shorttext>=0.4.0, a file with
+ followed by "_classlabels.txt", ".json" and ".weights.h5". For shorttext>=0.4.0, a file with
extension "_config.json" would also be used.
If this has not been run, or a model was not trained by :func:`~train`,
@@ -202,13 +150,32 @@ def loadmodel(self, nameprefix):
labelfile = open(nameprefix+'_classlabels.txt', 'r')
self.classlabels = labelfile.readlines()
labelfile.close()
- self.classlabels = map(lambda s: s.strip(), self.classlabels)
+ self.classlabels = [s.strip() for s in self.classlabels]
+
# check if _config.json exists.
# This file does not exist if the model was created with shorttext<0.4.0
if os.path.exists(nameprefix+'_config.json'):
- self.with_gensim = json.load(open(nameprefix+'_config.json', 'r'))['with_gensim']
+ config = json.load(open(nameprefix+'_config.json', 'r'))
+ # these fields are present for release >= 1.0.0
+ if 'maxlen' in config:
+ self.maxlen = config['maxlen']
+ else:
+ self.maxlen = 15
+ if 'vecsize' in config:
+ self.vecsize = config['vecsize']
+ else:
+ self.vecsize = self.wvmodel.vector_size
+ if self.vecsize != self.wvmodel.vector_size:
+ warnings.warn('Recorded vector size (%i) does not match that of the given word-embedding model (%i)! ' % (self.vecsize, self.wvmodel.vector_size)+
+ 'Setting the vector size to %i, but runtime errors may occur.' % (self.wvmodel.vector_size),
+ RuntimeWarning)
+ self.vecsize = self.wvmodel.vector_size
else:
- self.with_gensim = False
+ self.maxlen = 15
+ self.vecsize = self.wvmodel.vector_size
+ warnings.warn('Model files are from an older version of shorttext; falling back to maxlen=15 and the vector size of the given word-embedding model.')
+
+ self.with_gensim = False
self.trained = True
def word_to_embedvec(self, word):
@@ -243,19 +210,7 @@ def shorttext_to_matrix(self, shorttext):
matrix[i] = self.word_to_embedvec(tokens[i])
return matrix
- def process_text(self, shorttext):
- """Process the input text by tokenizing and padding it.
-
- :param shorttext: a short sentence
- """
- tokenizer = Tokenizer()
- tokenizer.fit_on_texts(shorttext)
- x_train = tokenizer.texts_to_sequences(shorttext)
-
- x_train = pad_sequences(x_train, maxlen=self.maxlen)
- return x_train
-
- def score(self, shorttext):
+ def score(self, shorttexts: Union[str, List[str]], model_params: Dict[str, Any] = {}):
""" Calculate the scores for all the class labels for the given short sentence.
Given a short sentence, calculate the classification scores for all class labels,
@@ -264,44 +219,50 @@ def score(self, shorttext):
If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.
:param shorttext: a short sentence
+ :param model_params: additional parameters to be passed to the model object
:return: a dictionary with keys being the class labels, and values being the corresponding classification scores
:type shorttext: str
:rtype: dict
:raise: ModelNotTrainedException
"""
+ is_multiple = True
+ if isinstance(shorttexts, str):
+ is_multiple = False
+ shorttexts = [shorttexts]
+
if not self.trained:
raise e.ModelNotTrainedException()
- if self.with_gensim:
- # tokenize and pad input text
- matrix = self.process_text(shorttext)
- else:
- # retrieve vector
- matrix = np.array([self.shorttext_to_matrix(shorttext)])
+ # retrieve vector
+ matrix = np.array([self.shorttext_to_matrix(shorttext) for shorttext in shorttexts])
# classification using the neural network
- predictions = self.model.predict(matrix)
+ predictions = self.model.predict(matrix, **model_params)
# wrangle output result
- scoredict = {}
- for idx, classlabel in zip(range(len(self.classlabels)), self.classlabels):
- scoredict[classlabel] = predictions[0][idx]
+ df = pd.DataFrame(predictions, columns=self.classlabels)
+
+ if not is_multiple:
+ return df.to_dict('records')[0]
+
+ return df.to_dict('records')
- return scoredict
-def load_varnnlibvec_classifier(wvmodel, name, compact=True):
- """ Load a :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model.
+def load_varnnlibvec_classifier(wvmodel, name, compact=True, vecsize=None):
+ """ Load a :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier` instance from file, given the pre-trained word-embedding model.
:param wvmodel: Word2Vec model
:param name: name (if compact=True) or prefix (if compact=False) of the file path
:param compact: whether model file is compact (Default: True)
+ :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
:return: the classifier
:type wvmodel: gensim.models.keyedvectors.KeyedVectors
:type name: str
:type compact: bool
+ :type vecsize: int
:rtype: VarNNEmbeddedVecClassifier
"""
- classifier = VarNNEmbeddedVecClassifier(wvmodel)
+ classifier = VarNNEmbeddedVecClassifier(wvmodel, vecsize=vecsize)
if compact:
classifier.load_compact_model(name)
else:
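
With the doctest-style example removed from the docstring above, here is a hedged replacement reflecting the new behaviour: vecsize is taken from the word-embedding model by default, and score() accepts either a single string or a list of strings, returning one dictionary or a list of dictionaries respectively. The model path is a placeholder, and load_word2vec_model is assumed to remain exported from shorttext.utils as in earlier releases.

import shorttext

# placeholder path to a pre-trained word-embedding model
wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)

trainclassdict = shorttext.data.subjectkeywords()  # bundled demo training data
kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict))  # ConvNet framework

classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)  # vecsize inferred from wvmodel
classifier.train(trainclassdict, kmodel)

print(classifier.score('artificial intelligence'))                    # a single dict of label scores
print(classifier.score(['linear algebra', 'quantum field theory']))   # a list of dicts, one per text
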
diff --git a/shorttext/classifiers/embed/nnlib/__init__.py b/src/shorttext/classifiers/embed/nnlib/__init__.py
similarity index 100%
rename from shorttext/classifiers/embed/nnlib/__init__.py
rename to src/shorttext/classifiers/embed/nnlib/__init__.py
diff --git a/shorttext/classifiers/embed/nnlib/frameworks.py b/src/shorttext/classifiers/embed/nnlib/frameworks.py
similarity index 51%
rename from shorttext/classifiers/embed/nnlib/frameworks.py
rename to src/shorttext/classifiers/embed/nnlib/frameworks.py
index cbd10dda..3cdff618 100644
--- a/shorttext/classifiers/embed/nnlib/frameworks.py
+++ b/src/shorttext/classifiers/embed/nnlib/frameworks.py
@@ -1,11 +1,13 @@
-from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
-from keras.models import Sequential, Model
-from keras.regularizers import l2
-from keras.engine import Input
+
+from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Activation
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.regularizers import l2
+
# Codes were changed because of Keras.
# Keras 1 --> Keras 2: https://github.com/fchollet/keras/wiki/Keras-2.0-release-notes
+
# Paper: Yoon Kim, "Convolutional Neural Networks for Sentence Classification," arXiv:1408.5882 (2014).
# ref: https://gist.github.com/entron/b9bc61a74e7cadeb1fec
# ref: http://cs231n.github.io/convolutional-networks/
@@ -14,13 +16,12 @@ def CNNWordEmbed(nb_labels,
nb_filters=1200,
n_gram=2,
maxlen=15,
- vecsize=100,
+ vecsize=300,
cnn_dropout=0.0,
final_activation='softmax',
dense_wl2reg=0.0,
dense_bl2reg=0.0,
- optimizer='adam',
- with_gensim=False):
+ optimizer='adam'):
""" Returns the convolutional neural network (CNN/ConvNet) for word-embedded vectors.
Reference: Yoon Kim, "Convolutional Neural Networks for Sentence Classification,"
@@ -32,13 +33,12 @@ def CNNWordEmbed(nb_labels,
:param nb_filters: number of filters (Default: 1200)
:param n_gram: n-gram, or window size of CNN/ConvNet (Default: 2)
:param maxlen: maximum number of words in a sentence (Default: 15)
- :param vecsize: length of the embedded vectors in the model (Default: 100)
+ :param vecsize: length of the embedded vectors in the model (Default: 300)
:param cnn_dropout: dropout rate for CNN/ConvNet (Default: 0.0)
:param final_activation: activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear. (Default: 'softmax')
:param dense_wl2reg: L2 regularization coefficient (Default: 0.0)
:param dense_bl2reg: L2 regularization coefficient for bias (Default: 0.0)
:param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam)
- :param with_gensim: boolean variable to indicate if the word-embeddings being used derived from a Gensim's Word2Vec model. (Default: True)
:return: keras model (`Sequential` or`Model`) for CNN/ConvNet for Word-Embeddings
:type nb_labels: int
:type wvmodel: gensim.models.keyedvectors.KeyedVectors
@@ -51,49 +51,28 @@ def CNNWordEmbed(nb_labels,
:type dense_wl2reg: float
:type dense_bl2reg: float
:type optimizer: str
- :type with_gensim: bool
- :rtype: keras.models.Sequential or keras.models.Model
+ :rtype: keras.models.Model
"""
- if with_gensim == True:
- embedding_layer = wvmodel.get_embedding_layer()
- sequence_input = Input(shape=(maxlen,), dtype='int32')
- x = embedding_layer(sequence_input)
- x = Conv1D(filters=nb_filters,
- kernel_size=n_gram,
- padding='valid',
- activation='relu',
- input_shape=(maxlen, vecsize))(x)
- if cnn_dropout > 0.0:
- x = Dropout(cnn_dropout)(x)
- x = MaxPooling1D(pool_size=maxlen - n_gram + 1)(x)
- x = Flatten()(x)
- x = Dense(nb_labels,
- activation=final_activation,
- kernel_regularizer=l2(dense_wl2reg),
- bias_regularizer=l2(dense_bl2reg))(x)
+ if wvmodel != None:
+ vecsize = wvmodel.vector_size
- model = Model(sequence_input, x)
- model.compile(loss='categorical_crossentropy', optimizer=optimizer)
- else:
- model = Sequential()
- model.add(Conv1D(filters=nb_filters,
- kernel_size=n_gram,
- padding='valid',
- activation='relu',
- input_shape=(maxlen, vecsize)))
- if cnn_dropout > 0.0:
- model.add(Dropout(cnn_dropout))
- model.add(MaxPooling1D(pool_size=maxlen - n_gram + 1))
- model.add(Flatten())
- model.add(Dense(nb_labels,
- activation=final_activation,
- kernel_regularizer=l2(dense_wl2reg),
- bias_regularizer=l2(dense_bl2reg))
- )
- model.compile(loss='categorical_crossentropy', optimizer=optimizer)
+ model = Sequential()
+ model.add(Conv1D(filters=nb_filters,
+ kernel_size=n_gram,
+ padding='valid',
+ activation='relu',
+ input_shape=(maxlen, vecsize)))
+ if cnn_dropout > 0.0:
+ model.add(Dropout(cnn_dropout))
+ model.add(MaxPooling1D(pool_size=maxlen - n_gram + 1))
+ model.add(Flatten())
+ model.add(Dense(nb_labels, kernel_regularizer=l2(dense_wl2reg), bias_regularizer=l2(dense_bl2reg)))
+ model.add(Activation(final_activation))
+ model.compile(loss='categorical_crossentropy', optimizer=optimizer)
return model
+
# two layers of CNN, maxpooling, dense
def DoubleCNNWordEmbed(nb_labels,
wvmodel=None,
@@ -102,14 +81,13 @@ def DoubleCNNWordEmbed(nb_labels,
n_gram=2,
filter_length_2=10,
maxlen=15,
- vecsize=100,
+ vecsize=300,
cnn_dropout_1=0.0,
cnn_dropout_2=0.0,
final_activation='softmax',
dense_wl2reg=0.0,
dense_bl2reg=0.0,
- optimizer='adam',
- with_gensim=False):
+ optimizer='adam'):
""" Returns the double-layered convolutional neural network (CNN/ConvNet) for word-embedded vectors.
:param nb_labels: number of class labels
@@ -119,7 +97,7 @@ def DoubleCNNWordEmbed(nb_labels,
:param n_gram: n-gram, or window size of first CNN/ConvNet (Default: 2)
:param filter_length_2: window size for second CNN/ConvNet layer (Default: 10)
:param maxlen: maximum number of words in a sentence (Default: 15)
- :param vecsize: length of the embedded vectors in the model (Default: 100)
+ :param vecsize: length of the embedded vectors in the model (Default: 300)
:param cnn_dropout_1: dropout rate for the first CNN/ConvNet layer (Default: 0.0)
:param cnn_dropout_2: dropout rate for the second CNN/ConvNet layer (Default: 0.0)
:param final_activation: activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear. (Default: 'softmax')
@@ -141,61 +119,34 @@ def DoubleCNNWordEmbed(nb_labels,
:type dense_wl2reg: float
:type dense_bl2reg: float
:type optimizer: str
- :type with_gensim: bool
- :rtype: keras.models.Sequential or keras.models.Model
+ :rtype: keras.models.Model
"""
- if with_gensim == True:
- embedding_layer = wvmodel.get_embedding_layer()
- sequence_input = Input(shape=(maxlen,), dtype='int32')
- x = embedding_layer(sequence_input)
- x = Conv1D(filters=nb_filters_1,
- kernel_size=n_gram,
- padding='valid',
- activation='relu',
- input_shape=(maxlen, vecsize))(x)
- if cnn_dropout_1 > 0.0:
- x = Dropout(cnn_dropout_1)(x)
- x = Conv1D(filters=nb_filters_2,
- kernel_size=filter_length_2,
- padding='valid',
- activation='relu')(x)
- if cnn_dropout_2 > 0.0:
- x = Dropout(cnn_dropout_2)(x)
- x = MaxPooling1D(pool_size=maxlen - n_gram -filter_length_2 + 1)(x)
- x = Flatten()(x)
- x = Dense(nb_labels,
- activation=final_activation,
- kernel_regularizer=l2(dense_wl2reg),
- bias_regularizer=l2(dense_bl2reg))(x)
+ if wvmodel != None:
+ vecsize = wvmodel.vector_size
- model = Model(sequence_input, x)
- model.compile(loss='categorical_crossentropy', optimizer=optimizer)
- else:
- model = Sequential()
- model.add(Conv1D(filters=nb_filters_1,
- kernel_size=n_gram,
- padding='valid',
- activation='relu',
- input_shape=(maxlen, vecsize)))
- if cnn_dropout_1 > 0.0:
- model.add(Dropout(cnn_dropout_1))
- model.add(Conv1D(filters=nb_filters_2,
- kernel_size=filter_length_2,
- padding='valid',
- activation='relu'))
- if cnn_dropout_2 > 0.0:
- model.add(Dropout(cnn_dropout_2))
- model.add(MaxPooling1D(pool_size=maxlen - n_gram -filter_length_2 + 1))
- model.add(Flatten())
- model.add(Dense(nb_labels,
- activation=final_activation,
- kernel_regularizer=l2(dense_wl2reg),
- bias_regularizer=l2(dense_bl2reg))
- )
- model.compile(loss='categorical_crossentropy', optimizer=optimizer)
+ model = Sequential()
+ model.add(Conv1D(filters=nb_filters_1,
+ kernel_size=n_gram,
+ padding='valid',
+ activation='relu',
+ input_shape=(maxlen, vecsize)))
+ if cnn_dropout_1 > 0.0:
+ model.add(Dropout(cnn_dropout_1))
+ model.add(Conv1D(filters=nb_filters_2,
+ kernel_size=filter_length_2,
+ padding='valid',
+ activation='relu'))
+ if cnn_dropout_2 > 0.0:
+ model.add(Dropout(cnn_dropout_2))
+ model.add(MaxPooling1D(pool_size=maxlen - n_gram -filter_length_2 + 1))
+ model.add(Flatten())
+ model.add(Dense(nb_labels, kernel_regularizer=l2(dense_wl2reg), bias_regularizer=l2(dense_bl2reg)))
+ model.add(Activation(final_activation))
+ model.compile(loss='categorical_crossentropy', optimizer=optimizer)
return model
+
# C-LSTM
# Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau,
# "A C-LSTM Neural Network for Text Classification", arXiv:1511.08630 (2015).
@@ -204,15 +155,14 @@ def CLSTMWordEmbed(nb_labels,
nb_filters=1200,
n_gram=2,
maxlen=15,
- vecsize=100,
+ vecsize=300,
cnn_dropout=0.0,
nb_rnnoutdim=1200,
rnn_dropout=0.2,
final_activation='softmax',
dense_wl2reg=0.0,
dense_bl2reg=0.0,
- optimizer='adam',
- with_gensim=False):
+ optimizer='adam'):
""" Returns the C-LSTM neural networks for word-embedded vectors.
Reference: Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau,
@@ -225,7 +175,7 @@ def CLSTMWordEmbed(nb_labels,
:param nb_filters: number of filters (Default: 1200)
:param n_gram: n-gram, or window size of CNN/ConvNet (Default: 2)
:param maxlen: maximum number of words in a sentence (Default: 15)
- :param vecsize: length of the embedded vectors in the model (Default: 100)
+ :param vecsize: length of the embedded vectors in the model (Default: 300)
:param cnn_dropout: dropout rate for CNN/ConvNet (Default: 0.0)
:param nb_rnnoutdim: output dimension for the LSTM networks (Default: 1200)
:param rnn_dropout: dropout rate for LSTM (Default: 0.2)
@@ -247,50 +197,25 @@ def CLSTMWordEmbed(nb_labels,
:type dense_wl2reg: float
:type dense_bl2reg: float
:type optimizer: str
- :type with_gensim: bool
- :rtype: keras.models.Sequential or keras.models.Model
+ :rtype: keras.models.Model
"""
- if with_gensim == True:
- embedding_layer = wvmodel.get_embedding_layer()
- sequence_input = Input(shape=(maxlen,), dtype='int32')
- x = embedding_layer(sequence_input)
- x = Conv1D(filters=nb_filters,
- kernel_size=n_gram,
- padding='valid',
- activation='relu',
- input_shape=(maxlen, vecsize))(x)
- if cnn_dropout > 0.0:
- x = Dropout(cnn_dropout)(x)
- x = MaxPooling1D(pool_size=maxlen - n_gram + 1)(x)
- x = LSTM(nb_rnnoutdim)(x)
- if rnn_dropout > 0.0:
- x = Dropout(rnn_dropout)(x)
- x = Dense(nb_labels,
- activation=final_activation,
- kernel_regularizer=l2(dense_wl2reg),
- bias_regularizer=l2(dense_bl2reg),)(x)
+ if wvmodel != None:
+ vecsize = wvmodel.vector_size
- model = Model(sequence_input, x)
- model.compile(loss='categorical_crossentropy', optimizer=optimizer)
- else:
- model = Sequential()
- model.add(Conv1D(filters=nb_filters,
- kernel_size=n_gram,
- padding='valid',
- activation='relu',
- input_shape=(maxlen, vecsize)))
- if cnn_dropout > 0.0:
- model.add(Dropout(cnn_dropout))
- model.add(MaxPooling1D(pool_size=maxlen - n_gram + 1))
- model.add(LSTM(nb_rnnoutdim))
- if rnn_dropout > 0.0:
- model.add(Dropout(rnn_dropout))
- model.add(Dense(nb_labels,
- activation=final_activation,
- kernel_regularizer=l2(dense_wl2reg),
- bias_regularizer=l2(dense_bl2reg),
- )
- )
- model.compile(loss='categorical_crossentropy', optimizer=optimizer)
+ model = Sequential()
+ model.add(Conv1D(filters=nb_filters,
+ kernel_size=n_gram,
+ padding='valid',
+ activation='relu',
+ input_shape=(maxlen, vecsize)))
+ if cnn_dropout > 0.0:
+ model.add(Dropout(cnn_dropout))
+ model.add(MaxPooling1D(pool_size=maxlen - n_gram + 1))
+ model.add(LSTM(nb_rnnoutdim))
+ if rnn_dropout > 0.0:
+ model.add(Dropout(rnn_dropout))
+ model.add(Dense(nb_labels, kernel_regularizer=l2(dense_wl2reg), bias_regularizer=l2(dense_bl2reg)))
+ model.add(Activation(final_activation))
+ model.compile(loss='categorical_crossentropy', optimizer=optimizer)
return model
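
All three framework builders above now return a compiled Keras Sequential model and no longer take a with_gensim flag; when a word-embedding model is passed, vecsize is overridden by wvmodel.vector_size. A minimal sketch of building one directly, assuming the module path shown in this diff:

from shorttext.classifiers.embed.nnlib.frameworks import CNNWordEmbed

# with no wvmodel given, vecsize falls back to the new default of 300
model = CNNWordEmbed(nb_labels=3, maxlen=15, vecsize=300)
model.summary()  # Conv1D -> MaxPooling1D -> Flatten -> Dense -> softmax
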
diff --git a/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py b/src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py
similarity index 80%
rename from shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py
rename to src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py
index 96aff3c0..6fbc22e3 100644
--- a/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py
+++ b/src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py
@@ -4,13 +4,12 @@
import numpy as np
from scipy.spatial.distance import cosine
-import shorttext.utils.classification_exceptions as e
-from shorttext.utils import tokenize
-import shorttext.utils.compactmodel_io as cio
+from ....utils.classification_exceptions import ModelNotTrainedException
+from ....utils import shorttext_to_avgvec
+from ....utils.compactmodel_io import CompactIOMachine
-@cio.compactio({'classifier': 'sumvec'}, 'sumvec', ['_embedvecdict.pkl'])
-class SumEmbeddedVecClassifier:
+class SumEmbeddedVecClassifier(CompactIOMachine):
"""
This is a supervised classification algorithm for short text categorization.
Each class label has a few short sentences, where each token is converted
@@ -23,18 +22,19 @@ class SumEmbeddedVecClassifier:
`_.
"""
- def __init__(self, wvmodel, vecsize=100, simfcn=lambda u, v: 1-cosine(u, v)):
+ def __init__(self, wvmodel, vecsize=None, simfcn=lambda u, v: 1-cosine(u, v)):
""" Initialize the classifier.
:param wvmodel: Word2Vec model
- :param vecsize: length of the embedded vectors in the model (Default: 100)
+ :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model)
:param simfcn: similarity function (Default: cosine similarity)
- :type wvmodel: gensim.models.word2vec.Word2Vec
+ :type wvmodel: gensim.models.keyedvectors.KeyedVectors
:type vecsize: int
:type simfcn: function
"""
+ CompactIOMachine.__init__(self, {'classifier': 'sumvec'}, 'sumvec', ['_embedvecdict.pkl'])
self.wvmodel = wvmodel
- self.vecsize = vecsize
+ self.vecsize = self.wvmodel.vector_size if vecsize == None else vecsize
self.simfcn = simfcn
self.trained = False
@@ -51,8 +51,9 @@ def train(self, classdict):
"""
self.addvec = defaultdict(lambda : np.zeros(self.vecsize))
for classtype in classdict:
- for shorttext in classdict[classtype]:
- self.addvec[classtype] += self.shorttext_to_embedvec(shorttext)
+ self.addvec[classtype] = np.sum([self.shorttext_to_embedvec(shorttext)
+ for shorttext in classdict[classtype]],
+ axis=0)
self.addvec[classtype] /= np.linalg.norm(self.addvec[classtype])
self.addvec = dict(self.addvec)
self.trained = True
@@ -70,8 +71,8 @@ def savemodel(self, nameprefix):
:raise: ModelNotTrainedException
"""
if not self.trained:
- raise e.ModelNotTrainedException()
- pickle.dump(self.addvec, open(nameprefix+'_embedvecdict.pkl', 'w'))
+ raise ModelNotTrainedException()
+ pickle.dump(self.addvec, open(nameprefix+'_embedvecdict.pkl', 'wb'))
def loadmodel(self, nameprefix):
""" Load a trained model from files.
@@ -86,7 +87,7 @@ def loadmodel(self, nameprefix):
:return: None
:type nameprefix: str
"""
- self.addvec = pickle.load(open(nameprefix+'_embedvecdict.pkl', 'r'))
+ self.addvec = pickle.load(open(nameprefix+'_embedvecdict.pkl', 'rb'))
self.trained = True
def shorttext_to_embedvec(self, shorttext):
@@ -102,14 +103,7 @@ def shorttext_to_embedvec(self, shorttext):
:type shorttext: str
:rtype: numpy.ndarray
"""
- vec = np.zeros(self.vecsize)
- for token in tokenize(shorttext):
- if token in self.wvmodel:
- vec += self.wvmodel[token]
- norm = np.linalg.norm(vec)
- if norm != 0:
- vec /= np.linalg.norm(vec)
- return vec
+ return shorttext_to_avgvec(shorttext, self.wvmodel)
def score(self, shorttext):
""" Calculate the scores for all the class labels for the given short sentence.
@@ -127,7 +121,7 @@ def score(self, shorttext):
:raise: ModelNotTrainedException
"""
if not self.trained:
- raise e.ModelNotTrainedException()
+ raise ModelNotTrainedException()
vec = self.shorttext_to_embedvec(shorttext)
scoredict = {}
for classtype in self.addvec:
@@ -137,19 +131,22 @@ def score(self, shorttext):
scoredict[classtype] = np.nan
return scoredict
-def load_sumword2vec_classifier(wvmodel, name, compact=True):
+
+def load_sumword2vec_classifier(wvmodel, name, compact=True, vecsize=None):
""" Load a :class:`shorttext.classifiers.SumEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model.
:param wvmodel: Word2Vec model
:param name: name (if compact=True) or prefix (if compact=False) of the file path
:param compact: whether model file is compact (Default: True)
+ :param vecsize: length of embedded vectors in the model (Default: None, directly extracted from word-embedding model)
:return: the classifier
:type wvmodel: gensim.models.keyedvectors.KeyedVectors
:type name: str
:type compact: bool
+ :type vecsize: int
:rtype: SumEmbeddedVecClassifier
"""
- classifier = SumEmbeddedVecClassifier(wvmodel)
+ classifier = SumEmbeddedVecClassifier(wvmodel, vecsize=vecsize)
if compact:
classifier.load_compact_model(name)
else:
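
shorttext_to_embedvec() above now delegates to the shared shorttext_to_avgvec helper. Based on the loop it replaces, the helper's behaviour is essentially the following (a sketch, not the library's actual implementation):

import numpy as np

def shorttext_to_avgvec_sketch(shorttext, wvmodel, tokenize=str.split):
    """Sum the in-vocabulary word vectors of a short text and L2-normalize the result
    (a zero vector is returned if no token is in the vocabulary)."""
    vec = np.zeros(wvmodel.vector_size)
    for token in tokenize(shorttext):
        if token in wvmodel:
            vec += wvmodel[token]
    norm = np.linalg.norm(vec)
    return vec / norm if norm != 0 else vec
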
diff --git a/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py b/src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py
similarity index 77%
rename from shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py
rename to src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py
index 6f216ef9..5a2af100 100644
--- a/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py
+++ b/src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py
@@ -1,11 +1,13 @@
+
import numpy as np
-import shorttext.utils.kerasmodel_io as kerasio
-import shorttext.utils.classification_exceptions as e
-from shorttext.utils.textpreprocessing import spacy_tokenize
+from ....utils import kerasmodel_io as kerasio
+from ....utils.classification_exceptions import ModelNotTrainedException
+from ....utils.textpreprocessing import tokenize
+from ....utils.compactmodel_io import CompactIOMachine
-class VarNNSumEmbeddedVecClassifier:
+class VarNNSumEmbeddedVecClassifier(CompactIOMachine):
"""
This is a wrapper for various neural network algorithms
for supervised short text categorization.
@@ -24,25 +26,26 @@ class VarNNSumEmbeddedVecClassifier:
`_.
"""
- def __init__(self, wvmodel, vecsize=100, maxlen=15):
+ def __init__(self, wvmodel, vecsize=None, maxlen=15):
""" Initialize the classifier.
:param wvmodel: Word2Vec model
- :param vecsize: length of the embedded vectors in the model (Default: 100)
+ :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
:param maxlen: maximum number of words in a sentence (Default: 15)
:type wvmodel: gensim.models.word2vec.Word2Vec
:type vecsize: int
:type maxlen: int
"""
+ CompactIOMachine.__init__(self, {'classifier': 'sumnnlibvec'}, 'sumnnlibvec', ['_classlabels.txt', '.json', '.weights.h5'])
self.wvmodel = wvmodel
- self.vecsize = vecsize
+ self.vecsize = self.wvmodel.vector_size if vecsize==None else vecsize
self.maxlen = maxlen
self.trained = False
def convert_traindata_embedvecs(self, classdict):
""" Convert the training text data into embedded matrix.
- COnvert the training text data into embedded matrix, where each short sentence
+ Convert the training text data into embedded matrix, where each short sentence
is a normalized sum of the embedded vectors of all its words.
:param classdict: training data
@@ -57,9 +60,8 @@ def convert_traindata_embedvecs(self, classdict):
embedvecs = []
for classlabel in classlabels:
for shorttext in classdict[classlabel]:
- embedvec = np.sum(np.array([self.word_to_embedvec(token) for token in spacy_tokenize(shorttext)]),
+ embedvec = np.sum(np.array([self.word_to_embedvec(token) for token in tokenize(shorttext)]),
axis=0)
- # embedvec = np.reshape(embedvec, embedvec.shape+(1,))
norm = np.linalg.norm(embedvec)
if norm == 0:
continue
@@ -104,7 +106,7 @@ def savemodel(self, nameprefix):
Given the prefix of the file paths, save the model into files, with name given by the prefix.
There will be three files produced, one name ending with "_classlabels.txt", one name
- ending with ".json", and one name ending with ".h5".
+ ending with ".json", and one name ending with ".weights.h5".
If there is no trained model, a `ModelNotTrainedException` will be thrown.
:param nameprefix: prefix of the file path
@@ -113,7 +115,7 @@ def savemodel(self, nameprefix):
:raise: ModelNotTrainedException
"""
if not self.trained:
- raise e.ModelNotTrainedException()
+ raise ModelNotTrainedException()
kerasio.save_model(nameprefix, self.model)
labelfile = open(nameprefix+'_classlabels.txt', 'w')
labelfile.write('\n'.join(self.classlabels))
@@ -123,7 +125,7 @@ def loadmodel(self, nameprefix):
""" Load a trained model from files.
Given the prefix of the file paths, load the model from files with name given by the prefix
- followed by "_classlabels.txt", ".json", and ".h5".
+ followed by "_classlabels.txt", ".json", and ".weights.h5".
If this has not been run, or a model was not trained by :func:`~train`,
a `ModelNotTrainedException` will be raised while performing prediction and saving the model.
@@ -136,7 +138,7 @@ def loadmodel(self, nameprefix):
labelfile = open(nameprefix+'_classlabels.txt', 'r')
self.classlabels = labelfile.readlines()
labelfile.close()
- self.classlabels = map(lambda s: s.strip(), self.classlabels)
+ self.classlabels = [s.strip() for s in self.classlabels]
self.trained = True
def word_to_embedvec(self, word):
@@ -166,12 +168,9 @@ def shorttext_to_embedvec(self, shorttext):
:type shorttext: str
:rtype: numpy.ndarray
"""
- vec = np.zeros(self.vecsize)
- for token in spacy_tokenize(shorttext):
- if token in self.wvmodel:
- vec += self.wvmodel[token]
+ vec = np.sum([self.wvmodel[token] for token in tokenize(shorttext) if token in self.wvmodel], axis=0)
norm = np.linalg.norm(vec)
- if norm!=0:
+ if norm != 0:
vec /= np.linalg.norm(vec)
return vec
@@ -191,7 +190,7 @@ def score(self, shorttext):
:raise: ModelNotTrainedException
"""
if not self.trained:
- raise e.ModelNotTrainedException()
+ raise ModelNotTrainedException()
# retrieve vector
embedvec = np.array(self.shorttext_to_embedvec(shorttext))
@@ -201,4 +200,26 @@ def score(self, shorttext):
# wrangle output result
scoredict = {classlabel: predictions[0][idx] for idx, classlabel in enumerate(self.classlabels)}
- return scoredict
\ No newline at end of file
+ return scoredict
+
+
+def load_varnnsumvec_classifier(wvmodel, name, compact=True, vecsize=None):
+ """ Load a :class:`shorttext.classifiers.VarNNSumEmbeddedVecClassifier` instance from file, given the pre-trained word-embedding model.
+
+ :param wvmodel: Word2Vec model
+ :param name: name (if compact=True) or prefix (if compact=False) of the file path
+ :param compact: whether the model file is compact (Default: True)
+ :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
+ :return: the classifier
+ :type wvmodel: gensim.models.keyedvectors.KeyedVectors
+ :type name: str
+ :type compact: bool
+ :type vecsize: int
+ :rtype: VarNNSumEmbeddedVecClassifier
+ """
+ classifier = VarNNSumEmbeddedVecClassifier(wvmodel, vecsize=vecsize)
+ if compact:
+ classifier.load_compact_model(name)
+ else:
+ classifier.loadmodel(name)
+ return classifier
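A short sketch of the vecsize=None behaviour introduced in this class, assuming `wvmodel` is an already-loaded gensim KeyedVectors and that the class is re-exported under shorttext.classifiers:

    from shorttext.classifiers import VarNNSumEmbeddedVecClassifier   # assumed re-export

    classifier = VarNNSumEmbeddedVecClassifier(wvmodel)        # vecsize=None
    assert classifier.vecsize == wvmodel.vector_size           # dimension read from the embeddings

    classifier_fixed = VarNNSumEmbeddedVecClassifier(wvmodel, vecsize=100)
    assert classifier_fixed.vecsize == 100                     # an explicit value still wins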
diff --git a/shorttext/classifiers/embed/sumvec/__init__.py b/src/shorttext/classifiers/embed/sumvec/__init__.py
similarity index 100%
rename from shorttext/classifiers/embed/sumvec/__init__.py
rename to src/shorttext/classifiers/embed/sumvec/__init__.py
diff --git a/shorttext/classifiers/embed/sumvec/frameworks.py b/src/shorttext/classifiers/embed/sumvec/frameworks.py
similarity index 68%
rename from shorttext/classifiers/embed/sumvec/frameworks.py
rename to src/shorttext/classifiers/embed/sumvec/frameworks.py
index 25bc7e5e..7372d76d 100644
--- a/shorttext/classifiers/embed/sumvec/frameworks.py
+++ b/src/shorttext/classifiers/embed/sumvec/frameworks.py
@@ -1,14 +1,15 @@
-from keras.layers import Dense
-from keras.models import Sequential
-from keras.regularizers import l2
-from shorttext.utils.classification_exceptions import UnequalArrayLengthsException
+from tensorflow.keras.layers import Dense, Activation
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.regularizers import l2
+
+from ....utils.classification_exceptions import UnequalArrayLengthsException
def DenseWordEmbed(nb_labels,
dense_nb_nodes=[],
dense_actfcn=[],
- vecsize=100,
+ vecsize=300,
reg_coef=0.1,
final_activiation='softmax',
optimizer='adam'):
@@ -19,7 +20,7 @@ def DenseWordEmbed(nb_labels,
:param nb_labels: number of class labels
:param dense_nb_nodes: number of nodes in each layer (Default: [])
:param dense_actfcn: activation functions for each layer (Default: [])
- :param vecsize: length of the embedded vectors in the model (Default: 100)
+ :param vecsize: length of the embedded vectors in the model (Default: 300)
:param reg_coef: regularization coefficient (Default: 0.1)
:param final_activiation: activation function of the final layer (Default: softmax)
:param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam)
@@ -31,7 +32,7 @@ def DenseWordEmbed(nb_labels,
:type reg_coef: float
:type final_activiation: str
:type optimizer: str
- :rtype: keras.models.Sequential
+ :rtype: keras.models.Model
"""
if len(dense_nb_nodes)!=len(dense_actfcn):
raise UnequalArrayLengthsException(dense_nb_nodes, dense_actfcn)
@@ -39,10 +40,7 @@ def DenseWordEmbed(nb_labels,
model = Sequential()
if nb_layers==0:
- model.add(Dense(nb_labels,
- input_shape=(vecsize,),
- activation=final_activiation,
- kernel_regularizer=l2(reg_coef)))
+ model.add(Dense(nb_labels, input_shape=(vecsize,), kernel_regularizer=l2(reg_coef)))
else:
model.add(Dense(dense_nb_nodes[0],
input_shape=(vecsize,),
@@ -50,14 +48,11 @@ def DenseWordEmbed(nb_labels,
kernel_regularizer=l2(reg_coef))
)
for nb_nodes, activation in zip(dense_nb_nodes[1:], dense_actfcn[1:]):
- model.add(Dense(nb_nodes,
- activation=activation,
- kernel_regularizer=l2(reg_coef))
- )
- model.add(Dense(nb_labels,
- activation=final_activiation,
- kernel_regularizer=l2(reg_coef))
- )
+ model.add(Dense(nb_nodes, activation=activation, kernel_regularizer=l2(reg_coef)))
+ model.add(Dense(nb_labels, kernel_regularizer=l2(reg_coef)))
+
+ # final activation layer
+ model.add(Activation(final_activiation))
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
return model
\ No newline at end of file
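To illustrate the refactored layer stack (hidden Dense layers, a final Dense without activation, then a separate Activation layer), a hedged sketch of calling DenseWordEmbed with arbitrary node counts:

    from shorttext.classifiers.embed.sumvec.frameworks import DenseWordEmbed

    model = DenseWordEmbed(nb_labels=5,
                           dense_nb_nodes=[64, 32],
                           dense_actfcn=['relu', 'relu'],
                           vecsize=300)
    model.summary()   # Dense(64) -> Dense(32) -> Dense(5) -> Activation('softmax')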
diff --git a/src/shorttext/cli/__init__.py b/src/shorttext/cli/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/shorttext/cli/categorization.py b/src/shorttext/cli/categorization.py
new file mode 100644
index 00000000..b2fe88ba
--- /dev/null
+++ b/src/shorttext/cli/categorization.py
@@ -0,0 +1,104 @@
+
+import os
+from functools import partial
+import argparse
+import logging
+from operator import itemgetter
+
+from ..utils.compactmodel_io import get_model_classifier_name
+from ..utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException
+from ..utils import load_word2vec_model, load_fasttext_model, load_poincare_model
+from ..smartload import smartload_compact_model
+from ..classifiers import TopicVectorCosineDistanceClassifier
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+allowed_classifiers = [
+ 'ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder',
+ 'topic_sklearn', 'nnlibvec', 'sumvec', 'maxent'
+]
+needembedded_classifiers = ['nnlibvec', 'sumvec']
+topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder']
+
+load_word2vec_nonbinary_model = partial(load_word2vec_model, binary=False)
+load_poincare_binary_model = partial(load_poincare_model, binary=True)
+
+typedict = {
+ 'word2vec': load_word2vec_model,
+ 'word2vec_nonbinary': load_word2vec_nonbinary_model,
+ 'fasttext': load_fasttext_model,
+ 'poincare': load_poincare_model,
+ 'poincare_binary': load_poincare_binary_model
+}
+
+
+def get_argparser():
+ parser = argparse.ArgumentParser(
+ description='Perform prediction on short text with a given trained model.'
+ )
+ parser.add_argument('model_filepath', help='Path of the trained (compact) model.')
+ parser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model.')
+ parser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)')
+ parser.add_argument('--topn', type=int, default=10, help='Number of top results to show.')
+ parser.add_argument('--inputtext', default=None, help='Single input text for classification. If omitted, will enter console mode.')
+ parser.add_argument('--type', default='word2vec', choices=typedict.keys(),
+ help='Type of word-embedding model (default: word2vec)')
+ return parser
+
+# main block
+def main():
+ # argument parsing
+ args = get_argparser().parse_args()
+
+ # check if the model file is given
+ if not os.path.exists(args.model_filepath):
+ raise IOError(f'Model file "{args.model_filepath}" not found!')
+
+ # get the name of the classifier
+ logger.info('Retrieving classifier name...')
+ classifier_name = get_model_classifier_name(args.model_filepath)
+
+ if classifier_name not in allowed_classifiers:
+ raise AlgorithmNotExistException(classifier_name)
+
+ # load the Word2Vec model if necessary
+ wvmodel = None
+ if classifier_name in needembedded_classifiers:
+ # check if the word embedding model is available
+ if not os.path.exists(args.wv):
+ raise WordEmbeddingModelNotExistException(args.wv)
+ # if there, load it
+ logger.info(f'Loading word-embedding model from {args.wv}...')
+ wvmodel = typedict[args.type](args.wv)
+
+ # load the classifier
+ logger.info('Initializing the classifier...')
+ if classifier_name in topicmodels:
+ topicmodel = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
+ classifier = TopicVectorCosineDistanceClassifier(topicmodel)
+ else:
+ classifier = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
+
+ # predict single input or run in console mode
+ if args.inputtext is not None:
+ if len(args.inputtext.strip()) == 0:
+ print('No input text provided.')
+ return
+ scoredict = classifier.score(args.inputtext)
+ for label, score in sorted(scoredict.items(), key=itemgetter(1), reverse=True)[:args.topn]:
+ print(f'{label} : {score:.4f}')
+ else:
+ # Console
+ print('Enter text to classify (empty input to quit):')
+ while True:
+ shorttext = input('text> ').strip()
+ if not shorttext:
+ break
+ scoredict = classifier.score(shorttext)
+ for label, score in sorted(scoredict.items(), key=itemgetter(1), reverse=True)[:args.topn]:
+ print(f'{label} : {score:.4f}')
+ print('Done.')
+
+if __name__ == "__main__":
+ main()
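Roughly what the console mode above does, shown programmatically; all file names are placeholders. From a shell, the module can also be invoked directly, e.g. `python -m shorttext.cli.categorization model.bin --wv word2vec.bin --inputtext "cancer immunotherapy"`.

    from shorttext.smartload import smartload_compact_model
    from shorttext.utils import load_word2vec_model

    wvmodel = load_word2vec_model('word2vec.bin')                   # placeholder path
    classifier = smartload_compact_model('model.bin', wvmodel, vecsize=300)
    scoredict = classifier.score('cancer immunotherapy')
    for label, score in sorted(scoredict.items(), key=lambda kv: kv[1], reverse=True)[:10]:
        print(f'{label} : {score:.4f}')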
diff --git a/src/shorttext/cli/wordembedsim.py b/src/shorttext/cli/wordembedsim.py
new file mode 100644
index 00000000..db62fcbd
--- /dev/null
+++ b/src/shorttext/cli/wordembedsim.py
@@ -0,0 +1,55 @@
+
+import argparse
+import time
+
+from scipy.spatial.distance import cosine
+
+from ..metrics.embedfuzzy import jaccardscore_sents
+from ..utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model
+from ..utils import shorttext_to_avgvec
+from ..metrics.wasserstein import word_mover_distance
+from ..metrics.dynprog.jaccard import soft_jaccard_score
+
+
+typedict = {
+ 'word2vec': load_word2vec_model,
+ 'fasttext': load_fasttext_model,
+ 'poincare': load_poincare_model
+}
+
+
+def getargparser():
+ parser = argparse.ArgumentParser(description='Find the similarities between two short sentences using Word2Vec.')
+ parser.add_argument('modelpath', help='Path of the Word2Vec model')
+ parser.add_argument('--type', default='word2vec',
+ help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare")')
+ return parser
+
+
+def main():
+ # argument parsing
+ args = getargparser().parse_args()
+
+ # preload tokenizer
+ tokenize('Mogu is cute.')
+
+ time0 = time.time()
+ print("Loading "+args.type+" model: "+args.modelpath)
+ wvmodel = typedict[args.type](args.modelpath)
+ time1 = time.time()
+ end = False
+ print("... loading time: "+str(time1 - time0)+" seconds")
+
+ while not end:
+ sent1 = input('sent1> ')
+ if len(sent1)==0:
+ end = True
+ else:
+ sent2 = input('sent2> ')
+
+ # output results
+ print("Cosine Similarity = %.4f" % (1 - cosine(shorttext_to_avgvec(sent1, wvmodel), shorttext_to_avgvec(sent2, wvmodel))))
+ print("Word-embedding Jaccard Score Similarity = %.4f" % jaccardscore_sents(sent1, sent2, wvmodel))
+ print("Word Mover's Distance = %.4f" % word_mover_distance(tokenize(sent1), tokenize(sent2), wvmodel))
+ print("Soft Jaccard Score (edit distance) = %.4f" % soft_jaccard_score(tokenize(sent1), tokenize(sent2)))
+
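The four measures printed by this script can also be computed directly, skipping the interactive loop; a sketch assuming `wvmodel` is an already-loaded word-embedding model:

    from scipy.spatial.distance import cosine
    from shorttext.utils import tokenize, shorttext_to_avgvec
    from shorttext.metrics.embedfuzzy import jaccardscore_sents
    from shorttext.metrics.wasserstein import word_mover_distance
    from shorttext.metrics.dynprog.jaccard import soft_jaccard_score

    sent1, sent2 = 'apple juice', 'orange juice'
    print(1 - cosine(shorttext_to_avgvec(sent1, wvmodel), shorttext_to_avgvec(sent2, wvmodel)))
    print(jaccardscore_sents(sent1, sent2, wvmodel))
    print(word_mover_distance(tokenize(sent1), tokenize(sent2), wvmodel))
    print(soft_jaccard_score(tokenize(sent1), tokenize(sent2)))    # needs no embedding model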
diff --git a/src/shorttext/data/__init__.py b/src/shorttext/data/__init__.py
new file mode 100644
index 00000000..5a1efbff
--- /dev/null
+++ b/src/shorttext/data/__init__.py
@@ -0,0 +1,2 @@
+
+from .data_retrieval import subjectkeywords, nihreports, inaugural, retrieve_jsondata_as_dict, retrieve_csvdata_as_dict, yield_crossvalidation_classdicts
diff --git a/shorttext/data/data_retrieval.py b/src/shorttext/data/data_retrieval.py
similarity index 79%
rename from shorttext/data/data_retrieval.py
rename to src/shorttext/data/data_retrieval.py
index 5b5229f0..af0958a1 100644
--- a/shorttext/data/data_retrieval.py
+++ b/src/shorttext/data/data_retrieval.py
@@ -1,10 +1,12 @@
+
import random
from collections import defaultdict
import json
import os
import zipfile
-from urllib import urlretrieve
import sys
+import csv
+from urllib.request import urlretrieve
import pandas as pd
import numpy as np
@@ -22,26 +24,18 @@ class labels, and second column the text data. It returns a dictionary with
:type filepath: str
:rtype: dict
"""
- df = pd.read_csv(filepath)
- category_col, descp_col = df.columns.values.tolist()
- shorttextdict = defaultdict(lambda : [])
- for category, descp in zip(df[category_col], df[descp_col]):
- if type(descp)==str:
- shorttextdict[category] += [descp]
+ datafile = open(filepath, 'r')
+ reader = csv.reader(datafile)
+ headerread = False
+ shorttextdict = defaultdict(lambda: [])
+ for label, content in reader:
+ if headerread:
+ if isinstance(content, str):
+ shorttextdict[label] += [content]
+ else:
+ headerread = True
return dict(shorttextdict)
-# for backward compatibility
-def retrieve_data_as_dict(filepath):
- """ Retrieve the training data in a CSV file.
-
- This calls :func:`~retrieve_csvdata_as_dict` for backward compatibility.
-
- :param filepath: path of the training data (CSV)
- :return: a dictionary with class labels as keys, and lists of short texts
- :type filepath: str
- :rtype: dict
- """
- return retrieve_csvdata_as_dict(filepath)
def retrieve_jsondata_as_dict(filepath):
""" Retrieve the training data in a JSON file.
@@ -57,6 +51,7 @@ def retrieve_jsondata_as_dict(filepath):
"""
return json.load(open(filepath, 'r'))
+
def subjectkeywords():
""" Return an example data set of subjects.
@@ -69,7 +64,8 @@ def subjectkeywords():
this_dir, _ = os.path.split(__file__)
return retrieve_csvdata_as_dict(os.path.join(this_dir, 'shorttext_exampledata.csv'))
-def inaugual():
+
+def inaugural():
""" Return an example dataset, which is the Inaugural Addresses of all Presidents of
the United States from George Washington to Barack Obama.
@@ -80,8 +76,13 @@ def inaugual():
:rtype: dict
"""
zfile = zipfile.ZipFile(get_or_download_data("USInaugural.zip",
- "https://github.com/stephenhky/PyShortTextCategorization/blob/master/data/USInaugural.zip?raw=true"))
- return json.loads(zfile.open("addresses.json").read())
+ "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/USInaugural.zip",
+ asbytes=True),
+ )
+ address_jsonstr = zfile.open("addresses.json").read()
+ zfile.close()
+ return json.loads(address_jsonstr.decode('utf-8'))
+
def nihreports(txt_col='PROJECT_TITLE', label_col='FUNDING_ICs', sample_size=512):
""" Return an example data set, sampled from NIH RePORT (Research Portfolio
@@ -116,26 +117,30 @@ def nihreports(txt_col='PROJECT_TITLE', label_col='FUNDING_ICs', sample_size=512
raise KeyError('Undefined label column: '+label_col+'. Must be FUNDING_ICs or IC_NAME.')
zfile = zipfile.ZipFile(get_or_download_data('nih_full.csv.zip',
- 'https://github.com/stephenhky/PyShortTextCategorization/blob/master/data/nih_full.csv.zip?raw=true')
- )
- nih = pd.read_csv(zfile.open('nih_full.csv'), na_filter=False, usecols=[label_col, txt_col])
+ 'https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/nih_full.csv.zip',
+ asbytes=True),
+ 'r',
+ zipfile.ZIP_DEFLATED)
+ nih = pd.read_csv(zfile.open('nih_full.csv'), na_filter=False, usecols=[label_col, txt_col], encoding='cp437')
+ zfile.close()
nb_data = len(nih)
sample_size = nb_data if sample_size==None else min(nb_data, sample_size)
classdict = defaultdict(lambda : [])
for rowidx in np.random.randint(nb_data, size=min(nb_data, sample_size)):
- label = nih.ix[rowidx, label_col]
+ label = nih.iloc[rowidx, nih.columns.get_loc(label_col)]
if label_col=='FUNDING_ICs':
if label=='':
label = 'OTHER'
else:
endpos = label.index(':')
label = label[:endpos]
- classdict[label] += [nih.ix[rowidx, txt_col]]
+ classdict[label] += [nih.iloc[rowidx, nih.columns.get_loc(txt_col)]]
return dict(classdict)
+
def mergedict(dicts):
""" Merge data dictionary.
@@ -152,6 +157,7 @@ def mergedict(dicts):
mdict[label] += thisdict[label]
return dict(mdict)
+
def yield_crossvalidation_classdicts(classdict, nb_partitions, shuffle=False):
""" Yielding test data and training data for cross validation by partitioning it.
@@ -168,7 +174,7 @@ def yield_crossvalidation_classdicts(classdict, nb_partitions, shuffle=False):
:rtype: generator
"""
crossvaldicts = []
- for i in range(nb_partitions):
+ for _ in range(nb_partitions):
crossvaldicts.append(defaultdict(lambda: []))
for label in classdict:
@@ -177,14 +183,15 @@ def yield_crossvalidation_classdicts(classdict, nb_partitions, shuffle=False):
sentences = classdict[label] if not shuffle else random.shuffle(sentences)
for i in range(nb_partitions):
crossvaldicts[i][label] += sentences[i * partsize:min(nb_data, (i + 1) * partsize)]
- crossvaldicts = map(dict, crossvaldicts)
+ crossvaldicts = [dict(crossvaldict) for crossvaldict in crossvaldicts]
for i in range(nb_partitions):
testdict = crossvaldicts[i]
traindict = mergedict([crossvaldicts[j] for j in range(nb_partitions) if j != i])
yield testdict, traindict
-def get_or_download_data(filename, origin):
+
+def get_or_download_data(filename, origin, asbytes=False):
# determine path
homedir = os.path.expanduser('~')
datadir = os.path.join(homedir, '.shorttext')
@@ -194,15 +201,15 @@ def get_or_download_data(filename, origin):
targetfilepath = os.path.join(datadir, filename)
# download if not exist
if not os.path.exists(os.path.join(datadir, filename)):
- print 'Downloading...'
- print 'Source: ', origin
- print 'Target: ', targetfilepath
+ print('Downloading...')
+ print('Source: ', origin)
+ print('Target: ', targetfilepath)
try:
urlretrieve(origin, targetfilepath)
except:
- print 'Failure to download file!'
- print sys.exc_info()
+ print('Failure to download file!')
+ print(sys.exc_info())
os.remove(targetfilepath)
# return
- return open(targetfilepath, 'r')
\ No newline at end of file
+ return open(targetfilepath, 'rb' if asbytes else 'r')
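A small sketch of the CSV layout that the rewritten retrieve_csvdata_as_dict expects: a header row (which is skipped), then one class label and one short text per row. The file name is a placeholder.

    # subjects.csv:
    #   subject,content
    #   physics,quantum mechanics
    #   physics,special relativity
    #   mathematics,linear algebra
    from shorttext.data import retrieve_csvdata_as_dict

    classdict = retrieve_csvdata_as_dict('subjects.csv')
    # {'physics': ['quantum mechanics', 'special relativity'], 'mathematics': ['linear algebra']}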
diff --git a/shorttext/data/shorttext_exampledata.csv b/src/shorttext/data/shorttext_exampledata.csv
similarity index 100%
rename from shorttext/data/shorttext_exampledata.csv
rename to src/shorttext/data/shorttext_exampledata.csv
diff --git a/src/shorttext/generators/__init__.py b/src/shorttext/generators/__init__.py
new file mode 100644
index 00000000..a0bf818c
--- /dev/null
+++ b/src/shorttext/generators/__init__.py
@@ -0,0 +1,9 @@
+from .bow.GensimTopicModeling import load_gensimtopicmodel
+from .bow.AutoEncodingTopicModeling import load_autoencoder_topicmodel
+
+from .bow.GensimTopicModeling import LatentTopicModeler, GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
+from .bow.AutoEncodingTopicModeling import AutoencodingTopicModeler
+
+from .charbase.char2vec import SentenceToCharVecEncoder, initSentenceToCharVecEncoder
+from .seq2seq.s2skeras import Seq2SeqWithKeras, loadSeq2SeqWithKeras
+from .seq2seq.charbaseS2S import CharBasedSeq2SeqGenerator, loadCharBasedSeq2SeqGenerator
diff --git a/shorttext/generators/bow/AutoEncodingTopicModeling.py b/src/shorttext/generators/bow/AutoEncodingTopicModeling.py
similarity index 86%
rename from shorttext/generators/bow/AutoEncodingTopicModeling.py
rename to src/shorttext/generators/bow/AutoEncodingTopicModeling.py
index 1271e4d0..5c12b279 100644
--- a/shorttext/generators/bow/AutoEncodingTopicModeling.py
+++ b/src/shorttext/generators/bow/AutoEncodingTopicModeling.py
@@ -1,25 +1,28 @@
+
import json
import pickle
+from functools import reduce
from operator import add
import numpy as np
from gensim.corpora import Dictionary
-from keras import Input
-from keras.engine import Model
-from keras.layers import Dense
+from tensorflow.keras import Input
+from tensorflow.keras import Model
+from tensorflow.keras.layers import Dense
from scipy.spatial.distance import cosine
from .LatentTopicModeling import LatentTopicModeler
-from utils import compactmodel_io as cio, classification_exceptions as e, kerasmodel_io as kerasio, \
- textpreprocessing as textpreprocess
+from ...utils import kerasmodel_io as kerasio, textpreprocessing as textpreprocess
+from ...utils.compactmodel_io import CompactIOMachine
+from ...utils.classification_exceptions import ModelNotTrainedException
+
-autoencoder_suffices = ['.gensimdict', '_encoder.json', '_encoder.h5', '_classtopicvecs.pkl',
- '_decoder.json', '_decoder.h5', '_autoencoder.json', '_autoencoder.h5',
+autoencoder_suffices = ['.gensimdict', '_encoder.json', '_encoder.weights.h5', '_classtopicvecs.pkl',
+ '_decoder.json', '_decoder.weights.h5', '_autoencoder.json', '_autoencoder.weights.h5',
'.json']
-@cio.compactio({'classifier': 'kerasautoencoder'}, 'kerasautoencoder', autoencoder_suffices)
-class AutoencodingTopicModeler(LatentTopicModeler):
+class AutoencodingTopicModeler(LatentTopicModeler, CompactIOMachine):
"""
This class facilitates the topic modeling of input training data using the autoencoder.
@@ -40,6 +43,7 @@ def train(self, classdict, nb_topics, *args, **kwargs):
:type classdict: dict
:type nb_topics: int
"""
+ CompactIOMachine.__init__(self, {'classifier': 'kerasautoencoder'}, 'kerasautoencoder', autoencoder_suffices)
self.nb_topics = nb_topics
self.generate_corpus(classdict)
vecsize = len(self.dictionary)
@@ -65,8 +69,7 @@ def train(self, classdict, nb_topics, *args, **kwargs):
# process training data
embedvecs = np.array(reduce(add,
- [map(lambda shorttext: self.retrieve_bow_vector(shorttext, normalize=True),
- classdict[classtype])
+ [[self.retrieve_bow_vector(shorttext, normalize=True) for shorttext in classdict[classtype]]
for classtype in classdict]
)
)
@@ -99,7 +102,7 @@ def retrieve_topicvec(self, shorttext):
:rtype: numpy.ndarray
"""
if not self.trained:
- raise e.ModelNotTrainedException()
+ raise ModelNotTrainedException()
bow_vector = self.retrieve_bow_vector(shorttext)
encoded_vec = self.encoder.predict(np.array([bow_vector]))[0]
if self.normalize:
@@ -117,7 +120,7 @@ def precalculate_liststr_topicvec(self, shorttexts):
:type shorttexts: list
:rtype: numpy.ndarray
"""
- sumvec = sum(map(self.retrieve_topicvec, shorttexts))
+ sumvec = sum([self.retrieve_topicvec(shorttext) for shorttext in shorttexts])
sumvec /= np.linalg.norm(sumvec)
return sumvec
@@ -134,7 +137,7 @@ def get_batch_cos_similarities(self, shorttext):
:rtype: dict
"""
if not self.trained:
- raise e.ModelNotTrainedException()
+ raise ModelNotTrainedException()
simdict = {}
for label in self.classtopicvecs:
simdict[label] = 1 - cosine(self.classtopicvecs[label], self.retrieve_topicvec(shorttext))
@@ -144,11 +147,11 @@ def savemodel(self, nameprefix, save_complete_autoencoder=True):
""" Save the model with names according to the prefix.
Given the prefix of the file paths, save the model into files, with name given by the prefix.
- There are files with names ending with "_encoder.json" and "_encoder.h5", which are
+ There are files with names ending with "_encoder.json" and "_encoder.weights.h5", which are
the JSON and HDF5 files for the encoder respectively. They also include a gensim dictionary (.gensimdict).
If `save_complete_autoencoder` is True,
- then there are also files with names ending with "_decoder.json" and "_decoder.h5".
+ then there are also files with names ending with "_decoder.json" and "_decoder.weights.h5".
If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.
@@ -159,7 +162,7 @@ def savemodel(self, nameprefix, save_complete_autoencoder=True):
:type save_complete_autoencoder: bool
"""
if not self.trained:
- raise e.ModelNotTrainedException()
+ raise ModelNotTrainedException()
parameters = {}
parameters['nb_topics'] = self.nb_topics
@@ -171,13 +174,13 @@ def savemodel(self, nameprefix, save_complete_autoencoder=True):
if save_complete_autoencoder:
kerasio.save_model(nameprefix+'_decoder', self.decoder)
kerasio.save_model(nameprefix+'_autoencoder', self.autoencoder)
- pickle.dump(self.classtopicvecs, open(nameprefix+'_classtopicvecs.pkl', 'w'))
+ pickle.dump(self.classtopicvecs, open(nameprefix+'_classtopicvecs.pkl', 'wb'))
def loadmodel(self, nameprefix, load_incomplete=False):
""" Save the model with names according to the prefix.
Given the prefix of the file paths, load the model into files, with name given by the prefix.
- There are files with names ending with "_encoder.json" and "_encoder.h5", which are
+ There are files with names ending with "_encoder.json" and "_encoder.weights.h5", which are
the JSON and HDF5 files for the encoder respectively.
They also include a gensim dictionary (.gensimdict).
@@ -188,13 +191,13 @@ def loadmodel(self, nameprefix, load_incomplete=False):
:type load_incomplete: bool
"""
# load the JSON file (parameters)
- parameters = json.load(open(nameprefix+'.json', 'rb'))
+ parameters = json.load(open(nameprefix+'.json', 'r'))
self.nb_topics = parameters['nb_topics']
self.classlabels = parameters['classlabels']
self.dictionary = Dictionary.load(nameprefix + '.gensimdict')
self.encoder = kerasio.load_model(nameprefix+'_encoder')
- self.classtopicvecs = pickle.load(open(nameprefix+'_classtopicvecs.pkl', 'r'))
+ self.classtopicvecs = pickle.load(open(nameprefix+'_classtopicvecs.pkl', 'rb'))
if not load_incomplete:
self.decoder = kerasio.load_model(nameprefix+'_decoder')
self.autoencoder = kerasio.load_model(nameprefix+'_autoencoder')
@@ -220,4 +223,4 @@ def load_autoencoder_topicmodel(name,
autoencoder.load_compact_model(name)
else:
autoencoder.loadmodel(name)
- return autoencoder
\ No newline at end of file
+ return autoencoder
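For orientation, a minimal sketch of reloading a compact autoencoder topic model; the file name is a placeholder and the loader's remaining keyword arguments are not reproduced here.

    from shorttext.generators import load_autoencoder_topicmodel

    topicmodeler = load_autoencoder_topicmodel('autoencoder_nihreports.bin')   # placeholder file
    print(topicmodeler.retrieve_topicvec('cancer immunotherapy'))
    print(topicmodeler.get_batch_cos_similarities('cancer immunotherapy'))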
diff --git a/shorttext/generators/bow/GensimTopicModeling.py b/src/shorttext/generators/bow/GensimTopicModeling.py
similarity index 93%
rename from shorttext/generators/bow/GensimTopicModeling.py
rename to src/shorttext/generators/bow/GensimTopicModeling.py
index aa03fa3e..17e8f644 100644
--- a/shorttext/generators/bow/GensimTopicModeling.py
+++ b/src/shorttext/generators/bow/GensimTopicModeling.py
@@ -1,3 +1,4 @@
+
import json
import gensim
@@ -6,12 +7,12 @@
from gensim.models import TfidfModel, LdaModel, LsiModel, RpModel
from gensim.similarities import MatrixSimilarity
-import shorttext.utils.classification_exceptions as e
-import shorttext.utils.compactmodel_io as cio
-from shorttext.utils import gensim_corpora as gc
+from ...utils import classification_exceptions as e
+from ...utils.compactmodel_io import CompactIOMachine, get_model_classifier_name
+from ...utils import gensim_corpora as gc
from .LatentTopicModeling import LatentTopicModeler
-from shorttext.utils import textpreprocessing as textpreprocess
-from shorttext.utils.textpreprocessing import spacy_tokenize as tokenize
+from ...utils import textpreprocessing as textpreprocess
+from ...utils.textpreprocessing import tokenize
gensim_topic_model_dict = {'lda': LdaModel, 'lsi': LsiModel, 'rp': RpModel}
@@ -171,7 +172,7 @@ def loadmodel(self, nameprefix):
:type nameprefix: str
"""
# load the JSON file (parameters)
- parameters = json.load(open(nameprefix+'.json', 'rb'))
+ parameters = json.load(open(nameprefix+'.json', 'r'))
self.nb_topics = parameters['nb_topics']
self.toweigh = parameters['toweigh']
self.algorithm = parameters['algorithm']
@@ -214,7 +215,7 @@ def savemodel(self, nameprefix):
parameters['toweigh'] = self.toweigh
parameters['algorithm'] = self.algorithm
parameters['classlabels'] = self.classlabels
- json.dump(parameters, open(nameprefix+'.json', 'wb'))
+ json.dump(parameters, open(nameprefix+'.json', 'w'))
self.dictionary.save(nameprefix+'.gensimdict')
self.topicmodel.save(nameprefix+'.gensimmodel')
@@ -222,13 +223,14 @@ def savemodel(self, nameprefix):
if self.toweigh:
self.tfidf.save(nameprefix+'.gensimtfidf')
+
lda_suffices = ['.json', '.gensimdict', '.gensimmodel.state',
'.gensimtfidf', '.gensimmodel', '.gensimmat']
if gensim.__version__ >= '1.0.0':
lda_suffices += ['.gensimmodel.expElogbeta.npy', '.gensimmodel.id2word']
-@cio.compactio({'classifier': 'ldatopic'}, 'ldatopic', lda_suffices)
-class LDAModeler(GensimTopicModeler):
+
+class LDAModeler(GensimTopicModeler, CompactIOMachine):
"""
This class facilitates the creation of LDA (latent Dirichlet Allocation) topic models,
with the given short text training data, and convert future
@@ -245,12 +247,13 @@ def __init__(self,
algorithm='lda',
toweigh=toweigh,
normalize=normalize)
+ CompactIOMachine.__init__(self, {'classifier': 'ldatopic'}, 'ldatopic', lda_suffices)
+
lsi_suffices = ['.json', '.gensimdict', '.gensimtfidf', '.gensimmodel.projection',
'.gensimmodel', '.gensimmat', ]
-@cio.compactio({'classifier': 'lsitopic'}, 'lsitopic', lsi_suffices)
-class LSIModeler(GensimTopicModeler):
+class LSIModeler(GensimTopicModeler, CompactIOMachine):
"""
This class facilitates the creation of LSI (latent semantic indexing) topic models,
with the given short text training data, and convert future
@@ -267,11 +270,12 @@ def __init__(self,
algorithm='lsi',
toweigh=toweigh,
normalize=normalize)
+ CompactIOMachine.__init__(self, {'classifier': 'lsitopic'}, 'lsitopic', lsi_suffices)
+
rp_suffices = ['.json', '.gensimtfidf', '.gensimmodel', '.gensimmat', '.gensimdict']
-@cio.compactio({'classifier': 'rptopic'}, 'rptopic', rp_suffices)
-class RPModeler(GensimTopicModeler):
+class RPModeler(GensimTopicModeler, CompactIOMachine):
"""
This class facilitates the creation of RP (random projection) topic models,
with the given short text training data, and convert future
@@ -288,6 +292,7 @@ def __init__(self,
algorithm='rp',
toweigh=toweigh,
normalize=normalize)
+ CompactIOMachine.__init__(self, {'classifier': 'rptopic'}, 'rptopic', rp_suffices)
def load_gensimtopicmodel(name,
@@ -306,7 +311,7 @@ def load_gensimtopicmodel(name,
"""
if compact:
modelerdict = {'ldatopic': LDAModeler, 'lsitopic': LSIModeler, 'rptopic': RPModeler}
- classifier_name = str(cio.get_model_classifier_name(name))
+ classifier_name = str(get_model_classifier_name(name))
topicmodeler = modelerdict[classifier_name](preprocessor=preprocessor)
topicmodeler.load_compact_model(name)
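A minimal sketch of the compact loading path shown above: the classifier name stored in the file ('ldatopic', 'lsitopic', or 'rptopic') selects LDAModeler, LSIModeler, or RPModeler. The file name is a placeholder.

    from shorttext.generators import load_gensimtopicmodel

    topicmodeler = load_gensimtopicmodel('lda_subjects.bin')
    print(topicmodeler.retrieve_topicvec('quantum field theory'))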
diff --git a/shorttext/generators/bow/LatentTopicModeling.py b/src/shorttext/generators/bow/LatentTopicModeling.py
similarity index 93%
rename from shorttext/generators/bow/LatentTopicModeling.py
rename to src/shorttext/generators/bow/LatentTopicModeling.py
index 4d8e8863..f1e12319 100644
--- a/shorttext/generators/bow/LatentTopicModeling.py
+++ b/src/shorttext/generators/bow/LatentTopicModeling.py
@@ -1,10 +1,13 @@
+
+from abc import ABC, abstractmethod
+
import numpy as np
-from shorttext.utils import textpreprocessing as textpreprocess, gensim_corpora as gc, classification_exceptions as e
-from shorttext.utils.textpreprocessing import spacy_tokenize as tokenize
+from ...utils import textpreprocessing as textpreprocess, gensim_corpora as gc, classification_exceptions as e
+from ...utils.textpreprocessing import tokenize
# abstract class
-class LatentTopicModeler:
+class LatentTopicModeler(ABC):
"""
Abstract class for various topic modeler.
"""
@@ -32,7 +35,7 @@ def generate_corpus(self, classdict):
"""
self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(classdict,
preprocess_and_tokenize=lambda sent: tokenize(self.preprocessor(sent)))
-
+ @abstractmethod
def train(self, classdict, nb_topics, *args, **kwargs):
""" Train the modeler.
@@ -78,6 +81,7 @@ def retrieve_bow_vector(self, shorttext, normalize=True):
vec /= np.linalg.norm(vec)
return vec
+ @abstractmethod
def retrieve_topicvec(self, shorttext):
""" Calculate the topic vector representation of the short text.
@@ -91,6 +95,7 @@ def retrieve_topicvec(self, shorttext):
"""
raise e.NotImplementedException()
+ @abstractmethod
def get_batch_cos_similarities(self, shorttext):
""" Calculate the cosine similarities of the given short text and all the class labels.
@@ -112,6 +117,7 @@ def __contains__(self, shorttext):
raise e.ModelNotTrainedException()
return True
+ @abstractmethod
def loadmodel(self, nameprefix):
""" Load the model from files.
@@ -124,6 +130,7 @@ def loadmodel(self, nameprefix):
"""
raise e.NotImplementedException()
+ @abstractmethod
def savemodel(self, nameprefix):
""" Save the model to files.
diff --git a/shorttext/generators/bow/__init__.py b/src/shorttext/generators/bow/__init__.py
similarity index 67%
rename from shorttext/generators/bow/__init__.py
rename to src/shorttext/generators/bow/__init__.py
index 5aad70c0..705c2c41 100644
--- a/shorttext/generators/bow/__init__.py
+++ b/src/shorttext/generators/bow/__init__.py
@@ -1,3 +1,4 @@
+
from . import AutoEncodingTopicModeling
from . import GensimTopicModeling
-from . import LatentTopicModeling
\ No newline at end of file
+from . import LatentTopicModeling
diff --git a/src/shorttext/generators/charbase/__init__.py b/src/shorttext/generators/charbase/__init__.py
new file mode 100644
index 00000000..02f47f7c
--- /dev/null
+++ b/src/shorttext/generators/charbase/__init__.py
@@ -0,0 +1,3 @@
+
+from . import char2vec
+
diff --git a/src/shorttext/generators/charbase/char2vec.py b/src/shorttext/generators/charbase/char2vec.py
new file mode 100644
index 00000000..b62df6dc
--- /dev/null
+++ b/src/shorttext/generators/charbase/char2vec.py
@@ -0,0 +1,103 @@
+
+from functools import partial
+
+import numpy as np
+from scipy.sparse import csc_matrix
+from gensim.corpora import Dictionary
+from sklearn.preprocessing import OneHotEncoder
+
+from ...utils.misc import textfile_generator
+
+
+class SentenceToCharVecEncoder:
+ """ A class that facilitates one-hot encoding from characters to vectors.
+
+ """
+ def __init__(self, dictionary, signalchar='\n'):
+ """ Initialize the one-hot encoding class.
+
+ :param dictionary: a gensim dictionary
+ :param signalchar: signal character, useful for seq2seq models (Default: '\n')
+ :type dictionary: gensim.corpora.Dictionary
+ :type signalchar: str
+ """
+ self.dictionary = dictionary
+ self.signalchar = signalchar
+ numchars = len(self.dictionary)
+ self.onehot_encoder = OneHotEncoder()
+ self.onehot_encoder.fit(np.arange(numchars).reshape((numchars, 1)))
+
+ def calculate_prelim_vec(self, sent):
+ """ Convert the sentence to a one-hot vector.
+
+ :param sent: sentence
+ :return: a one-hot vector, with each element the code of that character
+ :type sent: str
+ :rtype: numpy.array
+ """
+ return self.onehot_encoder.transform(
+ np.array([self.dictionary.token2id[c] for c in sent]).reshape((len(sent), 1))
+ )
+
+ def encode_sentence(self, sent, maxlen, startsig=False, endsig=False):
+ """ Encode one sentence to a sparse matrix, with each row the expanded vector of each character.
+
+ :param sent: sentence
+ :param maxlen: maximum length of the sentence
+ :param startsig: signal character at the beginning of the sentence (Default: False)
+ :param endsig: signal character at the end of the sentence (Default: False)
+ :return: matrix representing the sentence
+ :type sent: str
+ :type maxlen: int
+ :type startsig: bool
+ :type endsig: bool
+ :rtype: scipy.sparse.csc_matrix
+ """
+ cor_sent = (self.signalchar if startsig else '') + sent[:min(maxlen, len(sent))] + (self.signalchar if endsig else '')
+ sent_vec = self.calculate_prelim_vec(cor_sent).tocsc()
+ if sent_vec.shape[0] == maxlen + startsig + endsig:
+ return sent_vec
+ else:
+ return csc_matrix((sent_vec.data, sent_vec.indices, sent_vec.indptr),
+ shape=(maxlen + startsig + endsig, sent_vec.shape[1]),
+ dtype=np.float64)
+
+ def encode_sentences(self, sentences, maxlen, sparse=True, startsig=False, endsig=False):
+ """ Encode many sentences into a rank-3 tensor.
+
+ :param sentences: sentences
+ :param maxlen: maximum length of one sentence
+ :param sparse: whether to return a sparse matrix (Default: True)
+ :param startsig: signal character at the beginning of the sentence (Default: False)
+ :param endsig: signal character at the end of the sentence (Default: False)
+ :return: rank-3 tensor of the sentences
+ :type sentences: list
+ :type maxlen: int
+ :type sparse: bool
+ :type startsig: bool
+ :type endsig: bool
+ :rtype: scipy.sparse.csc_matrix or numpy.array
+ """
+ encode_sent_func = partial(self.encode_sentence, startsig=startsig, endsig=endsig, maxlen=maxlen)
+ list_encoded_sentences_map = map(encode_sent_func, sentences)
+ if sparse:
+ return list(list_encoded_sentences_map)
+ else:
+ return np.array([sparsevec.toarray() for sparsevec in list_encoded_sentences_map])
+
+ def __len__(self):
+ return len(self.dictionary)
+
+
+def initSentenceToCharVecEncoder(textfile, encoding=None):
+ """ Instantiate a class of SentenceToCharVecEncoder from a text file.
+
+ :param textfile: text file
+ :param encoding: encoding of the text file (Default: None)
+ :return: an instance of SentenceToCharVecEncoder
+ :type textfile: file
+ :type encoding: str
+ :rtype: SentenceToCharVecEncoder
+ """
+ dictionary = Dictionary(map(lambda line: [c for c in line], textfile_generator(textfile, encoding=encoding)))
+ return SentenceToCharVecEncoder(dictionary)
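A usage sketch of the character-level one-hot encoder, assuming the placeholder corpus contains every character of the sentences being encoded:

    from shorttext.generators import initSentenceToCharVecEncoder

    with open('corpus.txt') as textfile:                     # placeholder corpus
        encoder = initSentenceToCharVecEncoder(textfile)

    tensor = encoder.encode_sentences(['hello world', 'short text'], maxlen=20, sparse=False)
    print(len(encoder))      # number of distinct characters in the dictionary
    print(tensor.shape)      # (2, 20, len(encoder))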
diff --git a/src/shorttext/generators/seq2seq/__init__.py b/src/shorttext/generators/seq2seq/__init__.py
new file mode 100644
index 00000000..40c864ad
--- /dev/null
+++ b/src/shorttext/generators/seq2seq/__init__.py
@@ -0,0 +1,3 @@
+
+from . import s2skeras
+from . import charbaseS2S
diff --git a/src/shorttext/generators/seq2seq/charbaseS2S.py b/src/shorttext/generators/seq2seq/charbaseS2S.py
new file mode 100644
index 00000000..604c9f54
--- /dev/null
+++ b/src/shorttext/generators/seq2seq/charbaseS2S.py
@@ -0,0 +1,193 @@
+
+import json
+
+import numpy as np
+import gensim
+
+from .s2skeras import Seq2SeqWithKeras, loadSeq2SeqWithKeras, kerasseq2seq_suffices
+from ..charbase.char2vec import SentenceToCharVecEncoder
+from ...utils import compactmodel_io as cio
+
+
+charbases2s_suffices = kerasseq2seq_suffices + ['_dictionary.dict', '_charbases2s.json']
+
+
+class CharBasedSeq2SeqGenerator(cio.CompactIOMachine):
+ """ Class implementing character-based sequence-to-sequence (seq2seq) learning model.
+
+ This class implements the seq2seq model at the character level. This class calls
+ :class:`Seq2SeqWithKeras`.
+
+ Reference:
+
+ Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv
+ `_]
+ """
+ def __init__(self, sent2charvec_encoder, latent_dim, maxlen):
+ """ Instantiate the class.
+
+ If no one-hot encoder passed in, no compilation will be performed.
+
+ :param sent2charvec_encoder: the one-hot encoder
+ :param latent_dim: number of latent dimension
+ :param maxlen: maximum length of a sentence
+ :type sent2charvec_encoder: SentenceToCharVecEncoder
+ :type latent_dim: int
+ :type maxlen: int
+ """
+ cio.CompactIOMachine.__init__(self, {'classifier': 'charbases2s'}, 'charbases2s', charbases2s_suffices)
+ self.compiled = False
+ if sent2charvec_encoder is not None:
+ self.sent2charvec_encoder = sent2charvec_encoder
+ self.dictionary = self.sent2charvec_encoder.dictionary
+ self.nbelem = len(self.dictionary)
+ self.latent_dim = latent_dim
+ self.maxlen = maxlen
+ self.s2sgenerator = Seq2SeqWithKeras(self.nbelem, self.latent_dim)
+
+ def compile(self, optimizer='rmsprop', loss='categorical_crossentropy'):
+ """ Compile the keras model.
+
+ :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: rmsprop)
+ :param loss: loss function available from tensorflow.keras (Default: 'categorical_crossentropy')
+ :return: None
+ :type optimizer: str
+ :type loss: str
+ """
+ if not self.compiled:
+ self.s2sgenerator.prepare_model()
+ self.s2sgenerator.compile(optimizer=optimizer, loss=loss)
+ self.compiled = True
+
+ def prepare_trainingdata(self, txtseq):
+ """ Transforming sentence to a sequence of numerical vectors.
+
+ :param txtseq: text
+ :return: rank-3 tensors for encoder input, decoder input, and decoder output
+ :type txtseq: str
+ :rtype: (numpy.array, numpy.array, numpy.array)
+ """
+ encoder_input = self.sent2charvec_encoder.encode_sentences(txtseq[:-1], startsig=True, maxlen=self.maxlen, sparse=False)
+ decoder_input = self.sent2charvec_encoder.encode_sentences(txtseq[1:], startsig=True, maxlen=self.maxlen, sparse=False)
+ decoder_output = self.sent2charvec_encoder.encode_sentences(txtseq[1:], endsig=True, maxlen=self.maxlen, sparse=False)
+ return encoder_input, decoder_input, decoder_output
+
+ def train(self, txtseq, batch_size=64, epochs=100, optimizer='rmsprop', loss='categorical_crossentropy'):
+ """ Train the character-based seq2seq model.
+
+ :param txtseq: text
+ :param batch_size: batch size (Default: 64)
+ :param epochs: number of epochs (Default: 100)
+ :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: rmsprop)
+ :param loss: loss function available from tensorflow.keras (Default: 'categorical_crossentropy')
+ :return: None
+ :type txtseq: str
+ :type batch_size: int
+ :type epochs: int
+ :type optimizer: str
+ :type loss: str
+ """
+ encoder_input, decoder_input, decoder_output = self.prepare_trainingdata(txtseq)
+ self.compile(optimizer=optimizer, loss=loss)
+ self.s2sgenerator.fit(encoder_input, decoder_input, decoder_output, batch_size=batch_size, epochs=epochs)
+
+ def decode(self, txtseq, stochastic=True):
+ """ Given an input text, produce the output text.
+
+ :param txtseq: input text
+ :param stochastic: whether to sample the next character stochastically rather than taking the argmax (Default: True)
+ :return: output text
+ :type txtseq: str
+ :type stochastic: bool
+ :rtype: str
+ """
+ # Encode the input as state vectors.
+ inputvec = np.array([self.sent2charvec_encoder.encode_sentence(txtseq, maxlen=self.maxlen, endsig=True).toarray()])
+ states_value = self.s2sgenerator.encoder_model.predict(inputvec)
+
+ # Generate empty target sequence of length 1.
+ target_seq = np.zeros((1, 1, self.nbelem))
+ # Populate the first character of target sequence with the start character.
+ target_seq[0, 0, self.dictionary.token2id['\n']] = 1.
+
+ # Sampling loop for a batch of sequences
+ # (to simplify, here we assume a batch of size 1).
+ stop_condition = False
+ decoded_txtseq = ''
+ while not stop_condition:
+ output_tokens, h, c = self.s2sgenerator.decoder_model.predict([target_seq] + states_value)
+
+ # Sample a token
+ if stochastic:
+ sampled_token_index = np.random.choice(np.arange(output_tokens.shape[2]),
+ p=output_tokens[0, -1, :])
+ else:
+ sampled_token_index = np.argmax(output_tokens[0, -1, :])
+ sampled_char = self.dictionary[sampled_token_index]
+ decoded_txtseq += sampled_char
+
+ # Exit condition: either hit max length
+ # or find stop character.
+ if (sampled_char == '\n' or len(decoded_txtseq) > self.maxlen):
+ stop_condition = True
+
+ # Update the target sequence (of length 1).
+ target_seq = np.zeros((1, 1, self.nbelem))
+ target_seq[0, 0, sampled_token_index] = 1.
+
+ # Update states
+ states_value = [h, c]
+
+ return decoded_txtseq
+
+ def savemodel(self, prefix, final=False):
+ """ Save the trained models into multiple files.
+
+ To save it compactly, call :func:`~save_compact_model`.
+
+ If `final` is set to `True`, the model cannot be further trained.
+
+ If there is no trained model, a `ModelNotTrainedException` will be thrown.
+
+ :param prefix: prefix of the file path
+ :param final: whether the model is final (that should not be trained further) (Default: False)
+ :return: None
+ :type prefix: str
+ :type final: bool
+ :raise: ModelNotTrainedException
+ """
+ self.s2sgenerator.savemodel(prefix, final=final)
+ self.dictionary.save(prefix+'_dictionary.dict')
+ json.dump({'maxlen': self.maxlen, 'latent_dim': self.latent_dim}, open(prefix+'_charbases2s.json', 'w'))
+
+ def loadmodel(self, prefix):
+ """ Load a trained model from various files.
+
+ To load a compact model, call :func:`~load_compact_model`.
+
+ :param prefix: prefix of the file path
+ :return: None
+ :type prefix: str
+ """
+ self.dictionary = gensim.corpora.Dictionary.load(prefix+'_dictionary.dict')
+ self.s2sgenerator = loadSeq2SeqWithKeras(prefix, compact=False)
+ self.sent2charvec_encoder = SentenceToCharVecEncoder(self.dictionary)
+ self.nbelem = len(self.dictionary)
+ hyperparameters = json.load(open(prefix+'_charbases2s.json', 'r'))
+ self.latent_dim, self.maxlen = hyperparameters['latent_dim'], hyperparameters['maxlen']
+ self.compiled = True
+
+def loadCharBasedSeq2SeqGenerator(path, compact=True):
+ """ Load a trained `CharBasedSeq2SeqGenerator` class from file.
+
+ :param path: path of the model file
+ :param compact: whether it is a compact model (Default: True)
+ :return: a `CharBasedSeq2SeqGenerator` class for sequence to sequence inference
+ :type path: str
+ :type compact: bool
+ :rtype: CharBasedSeq2SeqGenerator
+ """
+ seq2seqer = CharBasedSeq2SeqGenerator(None, 0, 0)
+ if compact:
+ seq2seqer.load_compact_model(path)
+ else:
+ seq2seqer.loadmodel(path)
+ return seq2seqer
\ No newline at end of file
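A hedged end-to-end sketch of the character-based seq2seq generator; the corpus, hyperparameters, and file names are placeholders, and it is assumed the corpus covers all characters used at decode time.

    from shorttext.generators import CharBasedSeq2SeqGenerator, initSentenceToCharVecEncoder

    with open('dialogue.txt') as textfile:                   # placeholder corpus
        sent2charvec_encoder = initSentenceToCharVecEncoder(textfile)
    sentences = [line.strip() for line in open('dialogue.txt')]   # consecutive sentences

    generator = CharBasedSeq2SeqGenerator(sent2charvec_encoder, latent_dim=256, maxlen=60)
    generator.train(sentences, epochs=10)
    print(generator.decode('how are you'))
    generator.save_compact_model('charbases2s.bin')          # compact save via CompactIOMachine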
diff --git a/src/shorttext/generators/seq2seq/s2skeras.py b/src/shorttext/generators/seq2seq/s2skeras.py
new file mode 100644
index 00000000..36a9c79e
--- /dev/null
+++ b/src/shorttext/generators/seq2seq/s2skeras.py
@@ -0,0 +1,195 @@
+
+import json
+
+from tensorflow.keras.models import load_model
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, LSTM, Dense
+
+from ...utils import compactmodel_io as cio
+from ...utils import classification_exceptions as e
+
+# Reference: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
+
+kerasseq2seq_suffices = ['.weights.h5', '.json', '_s2s_hyperparam.json', '_encoder.weights.h5', '_encoder.json', '_decoder.weights.h5', '_decoder.json']
+
+
+class Seq2SeqWithKeras(cio.CompactIOMachine):
+ """ Class implementing sequence-to-sequence (seq2seq) learning with keras.
+
+ Reference:
+
+ Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto
+ `_]
+
+ Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). [`arXiv
+ `_]
+
+ Francois Chollet, "A ten-minute introduction to sequence-to-sequence learning in Keras," *The Keras Blog*. [`Keras
+ `_]
+
+ Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly
+ `_]
+ """
+ def __init__(self, vecsize, latent_dim):
+ """ Instantiate the class.
+
+ :param vecsize: vector size of the sequence
+ :param latent_dim: latent dimension in the RNN cell
+ :type vecsize: int
+ :type latent_dim: int
+ """
+ cio.CompactIOMachine.__init__(self, {'classifier': 'kerasseq2seq'}, 'kerasseq2seq', kerasseq2seq_suffices)
+ self.vecsize = vecsize
+ self.latent_dim = latent_dim
+ self.compiled = False
+ self.trained = False
+
+ def prepare_model(self):
+ """ Prepare the keras model.
+
+ :return: None
+ """
+ # Define an input sequence and process it.
+ encoder_inputs = Input(shape=(None, self.vecsize))
+ encoder = LSTM(self.latent_dim, return_state=True)
+ encoder_outputs, state_h, state_c = encoder(encoder_inputs)
+ # We discard `encoder_outputs` and only keep the states.
+ encoder_states = [state_h, state_c]
+
+ # Set up the decoder, using `encoder_states` as initial state.
+ decoder_inputs = Input(shape=(None, self.vecsize))
+ # We set up our decoder to return full output sequences,
+ # and to return internal states as well. We don't use the
+ # return states in the training model, but we will use them in inference.
+ decoder_lstm = LSTM(self.latent_dim, return_sequences=True, return_state=True)
+ decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
+ initial_state=encoder_states)
+ decoder_dense = Dense(self.vecsize, activation='softmax')
+ decoder_outputs = decoder_dense(decoder_outputs)
+
+ # Define the model that will turn
+ # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
+ model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
+
+ # Define sampling models
+ encoder_model = Model(encoder_inputs, encoder_states)
+
+ decoder_state_input_h = Input(shape=(self.latent_dim,))
+ decoder_state_input_c = Input(shape=(self.latent_dim,))
+ decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
+ decoder_outputs, state_h, state_c = decoder_lstm(
+ decoder_inputs, initial_state=decoder_states_inputs)
+ decoder_states = [state_h, state_c]
+ decoder_outputs = decoder_dense(decoder_outputs)
+ decoder_model = Model([decoder_inputs] + decoder_states_inputs,
+ [decoder_outputs] + decoder_states)
+
+ self.model = model
+ self.encoder_model = encoder_model
+ self.decoder_model = decoder_model
+
+ def compile(self, optimizer='rmsprop', loss='categorical_crossentropy'):
+ """ Compile the keras model after preparation running :func:`~prepare_model`.
+
+ :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: rmsprop)
+ :param loss: loss function available from tensorflow.keras (Default: 'categorical_crossentropy')
+ :type optimizer: str
+ :type loss: str
+ :return: None
+ """
+ self.model.compile(optimizer=optimizer, loss=loss)
+ self.compiled = True
+
+ def fit(self, encoder_input, decoder_input, decoder_output, batch_size=64, epochs=100):
+ """ Fit the sequence to learn the sequence-to-sequence (seq2seq) model.
+
+ :param encoder_input: encoder input, a rank-3 tensor
+ :param decoder_input: decoder input, a rank-3 tensor
+ :param decoder_output: decoder output, a rank-3 tensor
+ :param batch_size: batch size (Default: 64)
+ :param epochs: number of epochs (Default: 100)
+ :return: None
+ :type encoder_input: numpy.array
+ :type decoder_input: numpy.array
+ :type decoder_output: numpy.array
+ :type batch_size: int
+ :type epochs: int
+ """
+ self.model.fit([encoder_input, decoder_input], decoder_output,
+ batch_size=batch_size,
+ epochs=epochs)
+ self.trained = True
+
+ def savemodel(self, prefix, final=False):
+ """ Save the trained models into multiple files.
+
+ To save it compactly, call :func:`~save_compact_model`.
+
+ If `final` is set to `True`, the model cannot be further trained.
+
+ If there is no trained model, a `ModelNotTrainedException` will be thrown.
+
+ :param prefix: prefix of the file path
+ :param final: whether the model is final (that should not be trained further) (Default: False)
+ :return: None
+ :type prefix: str
+ :type final: bool
+ :raise: ModelNotTrainedException
+ """
+ if not self.trained:
+ raise e.ModelNotTrainedException()
+
+ # save hyperparameters
+ json.dump({'vecsize': self.vecsize, 'latent_dim': self.latent_dim}, open(prefix+'_s2s_hyperparam.json', 'w'))
+
+ # save whole model
+ if final:
+ self.model.save_weights(prefix+'.weights.h5')
+ else:
+ self.model.save(prefix+'.weights.h5')
+ open(prefix+'.json', 'w').write(self.model.to_json())
+
+ # save encoder and decoder
+ if final:
+ self.encoder_model.save_weights(prefix+'_encoder.weights.h5')
+ self.decoder_model.save_weights(prefix + '_decoder.weights.h5')
+ else:
+ self.encoder_model.save(prefix + '_encoder.weights.h5')
+ self.decoder_model.save(prefix+'_decoder.weights.h5')
+ open(prefix+'_encoder.json', 'w').write(self.encoder_model.to_json())
+ open(prefix+'_decoder.json', 'w').write(self.decoder_model.to_json())
+
+ def loadmodel(self, prefix):
+ """ Load a trained model from various files.
+
+ To load a compact model, call :func:`~load_compact_model`.
+
+ :param prefix: prefix of the file path
+ :return: None
+ :type prefix: str
+ """
+ hyperparameters = json.load(open(prefix+'_s2s_hyperparam.json', 'r'))
+ self.vecsize, self.latent_dim = hyperparameters['vecsize'], hyperparameters['latent_dim']
+ self.model = load_model(prefix+'.weights.h5')
+ self.encoder_model = load_model(prefix+'_encoder.weights.h5')
+ self.decoder_model = load_model(prefix+'_decoder.weights.h5')
+ self.trained = True
+
+
+def loadSeq2SeqWithKeras(path, compact=True):
+ """ Load a trained `Seq2SeqWithKeras` class from file.
+
+ :param path: path of the model file
+ :param compact: whether it is a compact model (Default: True)
+ :return: a `Seq2SeqWithKeras` class for sequence to sequence inference
+ :type path: str
+ :type compact: bool
+ :rtype: Seq2SeqWithKeras
+ """
+ generator = Seq2SeqWithKeras(0, 0)
+ if compact:
+ generator.load_compact_model(path)
+ else:
+ generator.loadmodel(path)
+ generator.compiled = True
+ return generator
\ No newline at end of file
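A smoke-test sketch of the lower-level seq2seq wrapper that CharBasedSeq2SeqGenerator builds on; the random tensors are only there to exercise the prepare/compile/fit/save/load cycle and carry no meaning.

    import numpy as np
    from shorttext.generators import Seq2SeqWithKeras, loadSeq2SeqWithKeras

    nb_samples, seqlen, vecsize = 32, 10, 128                # arbitrary toy dimensions
    encoder_input = np.random.rand(nb_samples, seqlen, vecsize)
    decoder_input = np.random.rand(nb_samples, seqlen, vecsize)
    decoder_output = np.random.rand(nb_samples, seqlen, vecsize)

    s2s = Seq2SeqWithKeras(vecsize=vecsize, latent_dim=256)
    s2s.prepare_model()
    s2s.compile()
    s2s.fit(encoder_input, decoder_input, decoder_output, batch_size=8, epochs=1)
    s2s.savemodel('my_s2s')                                  # my_s2s.weights.h5, my_s2s.json, ...
    restored = loadSeq2SeqWithKeras('my_s2s', compact=False)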
diff --git a/src/shorttext/metrics/__init__.py b/src/shorttext/metrics/__init__.py
new file mode 100644
index 00000000..bfb2af18
--- /dev/null
+++ b/src/shorttext/metrics/__init__.py
@@ -0,0 +1,4 @@
+
+from . import dynprog
+from . import embedfuzzy
+from . import wasserstein
diff --git a/src/shorttext/metrics/dynprog/__init__.py b/src/shorttext/metrics/dynprog/__init__.py
new file mode 100644
index 00000000..4bb67a12
--- /dev/null
+++ b/src/shorttext/metrics/dynprog/__init__.py
@@ -0,0 +1,4 @@
+
+from . import dldist
+from . import jaccard
+from . import lcp
diff --git a/src/shorttext/metrics/dynprog/dldist.py b/src/shorttext/metrics/dynprog/dldist.py
new file mode 100644
index 00000000..d8ff2f2d
--- /dev/null
+++ b/src/shorttext/metrics/dynprog/dldist.py
@@ -0,0 +1,41 @@
+
+import numpy as np
+import numba as nb
+
+
+@nb.njit
+def damerau_levenshtein(word1: str, word2: str) -> int:
+ """ Calculate the Demarau-Levenshtein (DL) distance between two words.
+
+ :param word1: first word
+ :param word2: second word
+ :return: Damerau-Levenshtein (DL) distance
+ :type word1: str
+ :type word2: str
+ :rtype: int
+ """
+ len1 = len(word1)
+ len2 = len(word2)
+ matrix = np.zeros((len1+1, len2+1), dtype=np.int8)
+
+ for i in range(len1+1):
+ matrix[i, 0] = i
+ for j in range(len2+1):
+ matrix[0, j] = j
+
+ # start the DP loops at 1 so the boundary row and column set above are preserved
+ for i in range(1, len1+1):
+ for j in range(1, len2+1):
+ # substitution cost: 0 if the characters match, 1 otherwise
+ cost = 1 if word1[i-1] != word2[j-1] else 0
+ delcost = matrix[i-1, j] + 1
+ inscost = matrix[i, j-1] + 1
+ subcost = matrix[i-1, j-1] + cost
+ score = min(min(delcost, inscost), subcost)
+ if ((i > 1) & (j > 1) & (word1[i - 1] == word2[j - 2]) & (word1[i - 2] == word2[j - 1])):
+ score = min(score, matrix[i-2, j-2] + cost)
+ matrix[i, j] = score
+
+ return matrix[len1, len2]
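Two worked examples of the distance implemented above: a single adjacent transposition costs 1, and the classic kitten/sitting pair needs three edits.

    from shorttext.metrics.dynprog.dldist import damerau_levenshtein

    print(damerau_levenshtein('act', 'cat'))         # 1  (transpose 'a' and 'c')
    print(damerau_levenshtein('kitten', 'sitting'))  # 3  (k->s, e->i, insert g)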
diff --git a/src/shorttext/metrics/dynprog/jaccard.py b/src/shorttext/metrics/dynprog/jaccard.py
new file mode 100644
index 00000000..64bc5877
--- /dev/null
+++ b/src/shorttext/metrics/dynprog/jaccard.py
@@ -0,0 +1,73 @@
+
+from itertools import product
+
+from .dldist import damerau_levenshtein
+from .lcp import longest_common_prefix
+
+
+def similarity(word1, word2):
+ """ Return the similarity between the two words.
+
+ Return the similarity between the two words, between 0 and 1 inclusively.
+ The similarity is the maximum of the two values:
+ - 1 - Damerau-Levenshtein distance between two words / maximum length of the two words
+ - longest common prefix of the two words / maximum length of the two words
+
+ Reference: Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE
+ `_]
+
+ :param word1: a word
+ :param word2: a word
+ :return: similarity, between 0 and 1 inclusively
+ :type word1: str
+ :type word2: str
+ :rtype: float
+ """
+ maxlen = max(len(word1), len(word2))
+ editdistance = damerau_levenshtein(word1, word2)
+ lcp = longest_common_prefix(word1, word2)
+ return max(1. - float(editdistance)/maxlen, float(lcp)/maxlen)
+
+
+def soft_intersection_list(tokens1, tokens2):
+ """ Return the soft number of intersections between two lists of tokens.
+
+ :param tokens1: list of tokens.
+ :param tokens2: list of tokens.
+ :return: soft number of intersections.
+ :type tokens1: list
+ :type tokens2: list
+ :rtype: float
+ """
+ intersected_list = [((token1, token2), similarity(token1, token2)) for token1, token2 in product(tokens1, tokens2)]
+ intersected_list = sorted(intersected_list, key=lambda item: item[1], reverse=True)
+
+ included_list = set()
+ used_tokens1 = set()
+ used_tokens2 = set()
+ for (token1, token2), sim in intersected_list:
+ if (not (token1 in used_tokens1)) and (not (token2 in used_tokens2)):
+ included_list.add(((token1, token2), sim))
+ used_tokens1.add(token1)
+ used_tokens2.add(token2)
+
+ return included_list
+
+
+def soft_jaccard_score(tokens1, tokens2):
+ """ Return the soft Jaccard score of the two lists of tokens, between 0 and 1 inclusively.
+
+    Reference: Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350 (2014).
+
+ :param tokens1: list of tokens.
+ :param tokens2: list of tokens.
+ :return: soft Jaccard score, between 0 and 1 inclusively.
+ :type tokens1: list
+ :type tokens2: list
+ :rtype: float
+ """
+ intersection_list = soft_intersection_list(tokens1, tokens2)
+ num_intersections = sum([item[1] for item in intersection_list])
+ num_unions = len(tokens1) + len(tokens2) - num_intersections
+ return float(num_intersections)/float(num_unions)
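A minimal usage sketch (editorial note, not part of the patch) of the soft Jaccard score; the exact value depends on the pairwise token similarities:

    from shorttext.metrics.dynprog.jaccard import soft_jaccard_score

    # exact matches contribute 1 to the soft intersection; near matches contribute their similarity
    score = soft_jaccard_score(['president', 'of', 'the', 'united', 'states'],
                               ['presidents', 'of', 'the', 'united', 'state'])
    print(score)   # close to, but below, 1.0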
diff --git a/src/shorttext/metrics/dynprog/lcp.py b/src/shorttext/metrics/dynprog/lcp.py
new file mode 100644
index 00000000..acec9515
--- /dev/null
+++ b/src/shorttext/metrics/dynprog/lcp.py
@@ -0,0 +1,22 @@
+
+import numba as nb
+
+
+@nb.njit
+def longest_common_prefix(word1: str, word2: str) -> int:
+ """ Calculate the longest common prefix (LCP) between two words.
+
+ :param word1: first word
+    :param word2: second word
+ :return: longest common prefix (LCP)
+ :type word1: str
+ :type word2: str
+ :rtype: int
+ """
+ lcp = 0
+ for i in range(min(len(word1), len(word2))):
+ if word1[i] == word2[i]:
+ lcp += 1
+ else:
+ break
+ return lcp
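A minimal usage sketch (editorial note, not part of the patch):

    from shorttext.metrics.dynprog.lcp import longest_common_prefix

    print(longest_common_prefix('debug', 'debut'))   # 4, the length of 'debu'
    print(longest_common_prefix('cat', 'dog'))       # 0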
diff --git a/src/shorttext/metrics/embedfuzzy/__init__.py b/src/shorttext/metrics/embedfuzzy/__init__.py
new file mode 100644
index 00000000..d1b9b4d9
--- /dev/null
+++ b/src/shorttext/metrics/embedfuzzy/__init__.py
@@ -0,0 +1,2 @@
+
+from .jaccard import jaccardscore_sents
\ No newline at end of file
diff --git a/src/shorttext/metrics/embedfuzzy/jaccard.py b/src/shorttext/metrics/embedfuzzy/jaccard.py
new file mode 100644
index 00000000..042df113
--- /dev/null
+++ b/src/shorttext/metrics/embedfuzzy/jaccard.py
@@ -0,0 +1,50 @@
+
+from itertools import product
+
+import numpy as np
+from scipy.spatial.distance import cosine
+
+from ...utils import tokenize
+
+
+def jaccardscore_sents(sent1, sent2, wvmodel, sim_words=lambda vec1, vec2: 1-cosine(vec1, vec2)):
+ """ Compute the Jaccard score between sentences based on their word similarities.
+
+ :param sent1: first sentence
+ :param sent2: second sentence
+    :param wvmodel: word-embedding model
+ :param sim_words: function for calculating the similarities between a pair of word vectors (default: cosine)
+ :return: soft Jaccard score
+ :type sent1: str
+ :type sent2: str
+ :type wvmodel: gensim.models.keyedvectors.KeyedVectors
+ :type sim_words: function
+ :rtype: float
+ """
+ tokens1 = tokenize(sent1)
+ tokens2 = tokenize(sent2)
+ tokens1 = list(filter(lambda w: w in wvmodel, tokens1))
+ tokens2 = list(filter(lambda w: w in wvmodel, tokens2))
+ allowable1 = [True] * len(tokens1)
+ allowable2 = [True] * len(tokens2)
+
+ simdict = {(i, j): sim_words(wvmodel[tokens1[i]], wvmodel[tokens2[j]])
+ for i, j in product(range(len(tokens1)), range(len(tokens2)))}
+
+ intersection = 0.0
+ simdictitems = sorted(simdict.items(), key=lambda s: s[1], reverse=True)
+ for idxtuple, sim in simdictitems:
+ i, j = idxtuple
+ if allowable1[i] and allowable2[j]:
+ intersection += sim
+ allowable1[i] = False
+ allowable2[j] = False
+
+ union = len(tokens1) + len(tokens2) - intersection
+
+ if union > 0:
+ return intersection / union
+ elif intersection == 0:
+ return 1.
+ else:
+ return np.inf
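A minimal usage sketch (editorial note, not part of the patch); the embedding model name below is only an illustration, and any gensim `KeyedVectors` model can be substituted:

    import gensim.downloader as api
    from shorttext.metrics.embedfuzzy import jaccardscore_sents

    wvmodel = api.load('glove-wiki-gigaword-50')   # illustrative choice of embedding model
    print(jaccardscore_sents('I love dogs', 'I adore puppies', wvmodel))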
diff --git a/src/shorttext/metrics/wasserstein/__init__.py b/src/shorttext/metrics/wasserstein/__init__.py
new file mode 100755
index 00000000..d274bad0
--- /dev/null
+++ b/src/shorttext/metrics/wasserstein/__init__.py
@@ -0,0 +1,2 @@
+
+from .wordmoverdist import word_mover_distance_linprog, word_mover_distance
\ No newline at end of file
diff --git a/src/shorttext/metrics/wasserstein/wordmoverdist.py b/src/shorttext/metrics/wasserstein/wordmoverdist.py
new file mode 100644
index 00000000..c127d841
--- /dev/null
+++ b/src/shorttext/metrics/wasserstein/wordmoverdist.py
@@ -0,0 +1,94 @@
+
+from itertools import product
+import warnings
+
+import numpy as np
+from scipy.spatial.distance import euclidean
+from scipy.sparse import csr_matrix
+from scipy.optimize import linprog
+
+from ...utils.gensim_corpora import tokens_to_fracdict
+
+
+def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean):
+ """ Compute the Word Mover's distance (WMD) between the two given lists of tokens, and return the LP problem class.
+
+    Using linear programming (via `scipy.optimize.linprog`), calculate the WMD between two lists of words. A word-embedding
+    model has to be provided. The whole `scipy.optimize.OptimizeResult` object is returned.
+
+ Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).
+
+ :param first_sent_tokens: first list of tokens.
+ :param second_sent_tokens: second list of tokens.
+ :param wvmodel: word-embedding models.
+ :param distancefunc: distance function that takes two numpy ndarray.
+ :return: the whole result of the linear programming problem
+ :type first_sent_tokens: list
+ :type second_sent_tokens: list
+ :type wvmodel: gensim.models.keyedvectors.KeyedVectors
+ :type distancefunc: function
+ :rtype: scipy.optimize.OptimizeResult
+ """
+ nb_tokens_first_sent = len(first_sent_tokens)
+ nb_tokens_second_sent = len(second_sent_tokens)
+
+ all_tokens = list(set(first_sent_tokens+second_sent_tokens))
+ wordvecs = {token: wvmodel[token] for token in all_tokens}
+
+ first_sent_buckets = tokens_to_fracdict(first_sent_tokens)
+ second_sent_buckets = tokens_to_fracdict(second_sent_tokens)
+
+ collapsed_idx_func = lambda i, j: i*nb_tokens_second_sent + j
+
+ # assigning T
+ T = np.zeros(nb_tokens_first_sent*nb_tokens_second_sent)
+ for i, j in product(range(nb_tokens_first_sent), range(nb_tokens_second_sent)):
+ T[collapsed_idx_func(i, j)] = distancefunc(wordvecs[first_sent_tokens[i]],
+ wordvecs[second_sent_tokens[j]])
+
+ # assigning Aeq and beq
+ Aeq = csr_matrix(
+ (nb_tokens_first_sent+nb_tokens_second_sent,
+ nb_tokens_first_sent*nb_tokens_second_sent)
+ )
+ beq = np.zeros(nb_tokens_first_sent+nb_tokens_second_sent)
+ for i in range(nb_tokens_first_sent):
+ for j in range(nb_tokens_second_sent):
+ Aeq[i, collapsed_idx_func(i, j)] = 1.
+ beq[i] = first_sent_buckets[first_sent_tokens[i]]
+ for j in range(nb_tokens_second_sent):
+ for i in range(nb_tokens_first_sent):
+ Aeq[j+nb_tokens_first_sent, collapsed_idx_func(i, j)] = 1.
+ beq[j+nb_tokens_first_sent] = second_sent_buckets[second_sent_tokens[j]]
+
+ return linprog(T, A_eq=Aeq, b_eq=beq)
+
+
+def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None):
+ """ Compute the Word Mover's distance (WMD) between the two given lists of tokens.
+
+ Using methods of linear programming, calculate the WMD between two lists of words. A word-embedding
+ model has to be provided. WMD is returned.
+
+ Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).
+
+ :param first_sent_tokens: first list of tokens.
+ :param second_sent_tokens: second list of tokens.
+ :param wvmodel: word-embedding models.
+ :param distancefunc: distance function that takes two numpy ndarray.
+    :param lpFile: deprecated, kept only for backward compatibility. (default: None)
+ :return: Word Mover's distance (WMD)
+ :type first_sent_tokens: list
+ :type second_sent_tokens: list
+ :type wvmodel: gensim.models.keyedvectors.KeyedVectors
+ :type distancefunc: function
+ :type lpFile: str
+ :rtype: float
+ """
+ linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel,
+ distancefunc=distancefunc)
+ if lpFile is not None:
+        warnings.warn(('The parameter `lpFile` (value: {}) is not used; the parameter is deprecated as '
+                       'the package `pulp` is no longer used. Check your code if there is a dependency on '
+                       'this parameter.').format(lpFile))
+ return linprog_result['fun']
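A minimal usage sketch (editorial note, not part of the patch); as above, the embedding model name is only an illustration:

    import gensim.downloader as api
    from shorttext.metrics.wasserstein import word_mover_distance

    wvmodel = api.load('glove-wiki-gigaword-50')   # illustrative choice of embedding model
    print(word_mover_distance(['president', 'speaks'], ['leader', 'talks'], wvmodel))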
diff --git a/shorttext/smartload.py b/src/shorttext/smartload.py
similarity index 77%
rename from shorttext/smartload.py
rename to src/shorttext/smartload.py
index d99307aa..9171b4a1 100644
--- a/shorttext/smartload.py
+++ b/src/shorttext/smartload.py
@@ -2,13 +2,15 @@
from .utils import standard_text_preprocessor_1
from .utils import compactmodel_io as cio
from .utils import classification_exceptions as e
+from .utils import load_DocumentTermMatrix
from .classifiers import load_varnnlibvec_classifier, load_sumword2vec_classifier
from .generators import load_autoencoder_topicmodel, load_gensimtopicmodel
+from .generators import loadSeq2SeqWithKeras, loadCharBasedSeq2SeqGenerator
from .classifiers import load_autoencoder_topic_sklearnclassifier, load_gensim_topicvec_sklearnclassifier
from .classifiers import load_maxent_classifier
-def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1()):
+def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1(), vecsize=None):
""" Load appropriate classifier or model from the binary model.
The second parameter, `wvmodel`, can be set to `None` if no Word2Vec model is needed.
@@ -16,11 +18,13 @@ def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_prepro
:param filename: path of the compact model file
:param wvmodel: Word2Vec model
:param preprocessor: text preprocessor (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`)
+ :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
:return: appropriate classifier or model
:raise: AlgorithmNotExistException
:type filename: str
:type wvmodel: gensim.models.keyedvectors.KeyedVectors
:type preprocessor: function
+ :type vecsize: int
"""
classifier_name = cio.get_model_classifier_name(filename)
if classifier_name in ['ldatopic', 'lsitopic', 'rptopic']:
@@ -36,10 +40,16 @@ def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_prepro
else:
raise e.AlgorithmNotExistException(topicmodel)
elif classifier_name in ['nnlibvec']:
- return load_varnnlibvec_classifier(wvmodel, filename, compact=True)
+ return load_varnnlibvec_classifier(wvmodel, filename, compact=True, vecsize=vecsize)
elif classifier_name in ['sumvec']:
- return load_sumword2vec_classifier(wvmodel, filename, compact=True)
+ return load_sumword2vec_classifier(wvmodel, filename, compact=True, vecsize=vecsize)
elif classifier_name in ['maxent']:
return load_maxent_classifier(filename, compact=True)
+ elif classifier_name in ['dtm']:
+ return load_DocumentTermMatrix(filename, compact=True)
+ elif classifier_name in ['kerasseq2seq']:
+ return loadSeq2SeqWithKeras(filename, compact=True)
+ elif classifier_name in ['charbases2s']:
+ return loadCharBasedSeq2SeqGenerator(filename, compact=True)
else:
raise e.AlgorithmNotExistException(classifier_name)
\ No newline at end of file
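A minimal usage sketch (editorial note, not part of the patch); `sample_classifier.bin` is a hypothetical compact model file saved earlier, and the embedding model is only needed for embedding-based classifiers:

    import gensim.downloader as api
    from shorttext.smartload import smartload_compact_model

    wvmodel = api.load('glove-wiki-gigaword-50')   # pass None for models that need no embeddings
    classifier = smartload_compact_model('sample_classifier.bin', wvmodel)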
diff --git a/src/shorttext/spell/__init__.py b/src/shorttext/spell/__init__.py
new file mode 100644
index 00000000..9515448a
--- /dev/null
+++ b/src/shorttext/spell/__init__.py
@@ -0,0 +1,5 @@
+
+from .basespellcorrector import SpellCorrector
+
+from .norvig import NorvigSpellCorrector
+
diff --git a/src/shorttext/spell/basespellcorrector.py b/src/shorttext/spell/basespellcorrector.py
new file mode 100644
index 00000000..4201beac
--- /dev/null
+++ b/src/shorttext/spell/basespellcorrector.py
@@ -0,0 +1,31 @@
+
+from abc import ABC, abstractmethod
+
+from ..utils.classification_exceptions import NotImplementedException
+
+
+class SpellCorrector(ABC):
+ """ Base class for all spell corrector.
+
+ This class is not implemented; this is an "abstract class."
+
+ """
+ @abstractmethod
+ def train(self, text):
+ """ Train the spell corrector with the given corpus.
+
+ :param text: training corpus
+ :type text: str
+ """
+ raise NotImplementedException()
+
+ @abstractmethod
+ def correct(self, word):
+ """ Recommend a spell correction to given the word.
+
+ :param word: word to be checked
+ :return: recommended correction
+ :type word: str
+ :rtype: str
+ """
+ return word
diff --git a/src/shorttext/spell/editor.py b/src/shorttext/spell/editor.py
new file mode 100644
index 00000000..0a501204
--- /dev/null
+++ b/src/shorttext/spell/editor.py
@@ -0,0 +1,22 @@
+
+import numba as nb
+
+
+@nb.njit
+def compute_set_edits1(word):
+ letters = 'abcdefghijklmnopqrstuvwxyz'
+
+ splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+ deletes = [L + R[1:] for L, R in splits if R]
+ transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+ replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+ inserts = [L + c + R for L, R in splits for c in letters]
+
+ returned_set = set(deletes + transposes + replaces + inserts)
+
+ return returned_set
+
+
+@nb.njit
+def compute_set_edits2(word):
+ return (e2 for e1 in compute_set_edits1(word) for e2 in compute_set_edits1(e1))
diff --git a/src/shorttext/spell/norvig.py b/src/shorttext/spell/norvig.py
new file mode 100644
index 00000000..f24be4ed
--- /dev/null
+++ b/src/shorttext/spell/norvig.py
@@ -0,0 +1,70 @@
+
+# reference: https://norvig.com/spell-correct.html
+
+import re
+from collections import Counter
+
+from . import SpellCorrector
+from .editor import compute_set_edits1, compute_set_edits2
+
+
+class NorvigSpellCorrector(SpellCorrector):
+ """ Spell corrector described by Peter Norvig in his blog. (https://norvig.com/spell-correct.html)
+
+ """
+ def __init__(self):
+ """ Instantiate the class
+
+ """
+ self.train('')
+
+ def train(self, text):
+ """ Given the text, train the spell corrector.
+
+ :param text: training corpus
+ :type text: str
+ """
+ self.words = re.findall('\\w+', text.lower())
+ self.WORDS = Counter(self.words)
+ self.N = sum(self.WORDS.values())
+
+ def P(self, word):
+ """ Compute the probability of the words randomly sampled from the training corpus.
+
+ :param word: a word
+ :return: probability of the word sampled randomly in the corpus
+ :type word: str
+ :rtype: float
+ """
+ return self.WORDS[word] / float(self.N)
+
+ def correct(self, word):
+ """ Recommend a spelling correction to the given word
+
+ :param word: a word
+ :return: recommended correction
+ :type word: str
+ :rtype: str
+ """
+ return max(self.candidates(word), key=self.P)
+
+ def known(self, words):
+ """ Filter away the words that are not found in the training corpus.
+
+ :param words: list of words
+        :return: set of words that can be found in the training corpus
+        :type words: list
+        :rtype: set
+ """
+ return set(w for w in words if w in self.WORDS)
+
+ def candidates(self, word):
+ """ List potential candidates for corrected spelling to the given words.
+
+ :param word: a word
+ :return: list of recommended corrections
+ :type word: str
+ :rtype: list
+ """
+ return (self.known([word]) or self.known(compute_set_edits1(word)) or self.known(compute_set_edits2(word)) or [word])
+
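A minimal usage sketch (editorial note, not part of the patch); any sizeable text can serve as the training corpus, and the tiny string below is only an illustration:

    from shorttext.spell import NorvigSpellCorrector

    corrector = NorvigSpellCorrector()
    corrector.train('the quick brown fox jumps over the lazy dog while the fox runs away')
    print(corrector.correct('fpx'))   # likely 'fox'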
diff --git a/shorttext/stack/__init__.py b/src/shorttext/stack/__init__.py
similarity index 100%
rename from shorttext/stack/__init__.py
rename to src/shorttext/stack/__init__.py
diff --git a/shorttext/stack/stacking.py b/src/shorttext/stack/stacking.py
similarity index 88%
rename from shorttext/stack/stacking.py
rename to src/shorttext/stack/stacking.py
index a7888f33..7bbb6c09 100644
--- a/shorttext/stack/stacking.py
+++ b/src/shorttext/stack/stacking.py
@@ -1,16 +1,19 @@
+
import pickle
+from abc import ABC, abstractmethod
import numpy as np
-from keras.layers import Dense, Reshape
-from keras.models import Sequential
-from keras.regularizers import l2
+from tensorflow.keras.layers import Dense, Reshape
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.regularizers import l2
+
+from ..utils import classification_exceptions as e
+from ..utils import kerasmodel_io as kerasio
+from ..utils.compactmodel_io import CompactIOMachine
-import shorttext.utils.classification_exceptions as e
-import shorttext.utils.kerasmodel_io as kerasio
-import shorttext.utils.compactmodel_io as cio
# abstract class
-class StackedGeneralization:
+class StackedGeneralization(ABC):
"""
This is an abstract class for any stacked generalization method. It is an intermediate model
that takes the results of other classifiers as the input features, and perform another classification.
@@ -24,13 +27,13 @@ class StackedGeneralization:
M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization,"
*WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015).
"""
- def __init__(self, intermediate_classifiers={}):
+ def __init__(self, intermediate_classifiers=None):
""" Initialize the stacking class instance.
:param intermediate_classifiers: dictionary, with key being a string, and the values intermediate classifiers, that have the method :func:`~score`, which takes a string as the input argument.
:type intermediate_classifiers: dict
"""
- self.classifiers = intermediate_classifiers
+ self.classifiers = intermediate_classifiers if intermediate_classifiers is not None else {}
self.classlabels = []
self.trained = False
@@ -115,7 +118,7 @@ def convert_label_to_buckets(self, label):
:type label: str
:rtype: numpy.ndarray
"""
- buckets = np.zeros(len(self.labels2idx), dtype=np.int)
+ buckets = np.zeros(len(self.labels2idx), dtype=np.int_)
buckets[self.labels2idx[label]] = 1
return buckets
@@ -135,6 +138,7 @@ def convert_traindata_matrix(self, classdict, tobucket=True):
X = self.translate_shorttext_intfeature_matrix(shorttext)
yield X, y
+ @abstractmethod
def train(self, classdict, *args, **kwargs):
""" Train the stacked generalization.
@@ -151,6 +155,7 @@ def train(self, classdict, *args, **kwargs):
"""
raise e.NotImplementedException()
+ @abstractmethod
def score(self, shorttext, *args, **kwargs):
""" Calculate the scores for each class labels.
@@ -168,9 +173,8 @@ def score(self, shorttext, *args, **kwargs):
"""
raise e.NotImplementedException()
-@cio.compactio({'classifier': 'stacked_logistics'}, 'stacked_logistics',
- ['_stackedlogistics.pkl', '_stackedlogistics.h5', '_stackedlogistics.json'])
-class LogisticStackedGeneralization(StackedGeneralization):
+
+class LogisticStackedGeneralization(StackedGeneralization, CompactIOMachine):
"""
This class implements logistic regression as the stacked generalizer.
@@ -181,6 +185,13 @@ class LogisticStackedGeneralization(StackedGeneralization):
The classifiers must have the :func:`~score` method that takes a string as an input argument.
"""
+    def __init__(self, intermediate_classifiers=None):
+ CompactIOMachine.__init__(self,
+ {'classifier': 'stacked_logistics'},
+ 'stacked_logistics',
+ ['_stackedlogistics.pkl', '_stackedlogistics.weights.h5', '_stackedlogistics.json'])
+ StackedGeneralization.__init__(self, intermediate_classifiers=intermediate_classifiers)
+
def train(self, classdict, optimizer='adam', l2reg=0.01, bias_l2reg=0.01, nb_epoch=1000):
""" Train the stacked generalization.
@@ -196,6 +207,7 @@ def train(self, classdict, optimizer='adam', l2reg=0.01, bias_l2reg=0.01, nb_epo
:type bias_l2reg: float
:type nb_epoch: int
"""
+
# register
self.register_classifiers()
self.register_classlabels(classdict.keys())
@@ -211,8 +223,8 @@ def train(self, classdict, optimizer='adam', l2reg=0.01, bias_l2reg=0.01, nb_epo
kmodel.compile(loss='categorical_crossentropy', optimizer=optimizer)
Xy = [(xone, yone) for xone, yone in self.convert_traindata_matrix(classdict, tobucket=True)]
- X = np.array(map(lambda item: item[0], Xy))
- y = np.array(map(lambda item: item[1], Xy))
+ X = np.array([item[0] for item in Xy])
+ y = np.array([item[1] for item in Xy])
kmodel.fit(X, y, epochs=nb_epoch)
@@ -261,7 +273,7 @@ def savemodel(self, nameprefix):
stackedmodeldict = {'classifiers': self.classifier2idx,
'classlabels': self.classlabels}
- pickle.dump(stackedmodeldict, open(nameprefix+'_stackedlogistics.pkl', 'w'))
+ pickle.dump(stackedmodeldict, open(nameprefix+'_stackedlogistics.pkl', 'wb'))
kerasio.save_model(nameprefix+'_stackedlogistics', self.model)
def loadmodel(self, nameprefix):
@@ -274,13 +286,10 @@ def loadmodel(self, nameprefix):
:return: None
:type nameprefix: str
"""
- stackedmodeldict = pickle.load(open(nameprefix+'_stackedlogistics.pkl', 'r'))
+ stackedmodeldict = pickle.load(open(nameprefix+'_stackedlogistics.pkl', 'rb'))
self.register_classlabels(stackedmodeldict['classlabels'])
self.classifier2idx = stackedmodeldict['classifiers']
- self.idx2classifier = {}
- for key, val in self.classifier2idx.items():
- self.idx2classifier[val] = key
-
+ self.idx2classifier = {val: key for key, val in self.classifier2idx.items()}
self.model = kerasio.load_model(nameprefix+'_stackedlogistics')
self.trained = True
diff --git a/src/shorttext/utils/__init__.py b/src/shorttext/utils/__init__.py
new file mode 100644
index 00000000..a74f960e
--- /dev/null
+++ b/src/shorttext/utils/__init__.py
@@ -0,0 +1,19 @@
+
+from . import misc
+from . import kerasmodel_io
+from . import classification_exceptions
+from . import gensim_corpora
+from . import textpreprocessing
+from . import compactmodel_io
+from . import dtm
+
+from .textpreprocessing import tokenize, stemword
+from .textpreprocessing import text_preprocessor, standard_text_preprocessor_1, standard_text_preprocessor_2
+
+from .wordembed import load_word2vec_model, load_fasttext_model, load_poincare_model, shorttext_to_avgvec
+from .wordembed import RESTfulKeyedVectors
+from .dtm import DocumentTermMatrix, load_DocumentTermMatrix
+
+
diff --git a/shorttext/utils/classification_exceptions.py b/src/shorttext/utils/classification_exceptions.py
similarity index 70%
rename from shorttext/utils/classification_exceptions.py
rename to src/shorttext/utils/classification_exceptions.py
index a2e758ad..97184a93 100644
--- a/shorttext/utils/classification_exceptions.py
+++ b/src/shorttext/utils/classification_exceptions.py
@@ -1,24 +1,37 @@
+
+
+
class ModelNotTrainedException(Exception):
def __init__(self):
self.message = 'Model not trained.'
+
class AlgorithmNotExistException(Exception):
def __init__(self, algoname):
self.message = 'Algorithm '+algoname+' not exist.'
-class Word2VecModelNotExistException(Exception):
+
+class WordEmbeddingModelNotExistException(Exception):
def __init__(self, path):
- self.message = 'Given path of Word2Vec not exist: '+path
+        self.message = 'Given path of the word-embedding model does not exist: ' + path
+
class UnequalArrayLengthsException(Exception):
def __init__(self, arr1, arr2):
self.message = 'Unequal lengths: '+str(len(arr1))+" and "+str(len(arr2))
+
class NotImplementedException(Exception):
def __init__(self):
self.message = 'Method not implemented.'
+
class IncorrectClassificationModelFileException(Exception):
def __init__(self, expectedname, actualname):
- self.message = 'Incorrect model (expected: '+expectedname+' ; actual: '+actualname+')'
\ No newline at end of file
+ self.message = 'Incorrect model (expected: '+expectedname+' ; actual: '+actualname+')'
+
+
+class OperationNotDefinedException(Exception):
+ def __init__(self, opname):
+ self.message = 'Operation '+opname+' not defined'
diff --git a/shorttext/utils/compactmodel_io.py b/src/shorttext/utils/compactmodel_io.py
similarity index 62%
rename from shorttext/utils/compactmodel_io.py
rename to src/shorttext/utils/compactmodel_io.py
index 6c2354d7..ee775479 100644
--- a/shorttext/utils/compactmodel_io.py
+++ b/src/shorttext/utils/compactmodel_io.py
@@ -13,8 +13,10 @@
from functools import partial
from . import classification_exceptions as e
+from deprecation import deprecated
-def removedir(dir):
+
+def removedir(dir: str):
""" Remove all subdirectories and files under the specified path.
:param dir: path of the directory to be clean
@@ -23,9 +25,9 @@ def removedir(dir):
for filename in os.listdir(dir):
if os.path.isdir(filename):
removedir(os.path.join(dir, filename))
- os.rmdir(os.path.isdir(filename))
+            os.rmdir(os.path.join(dir, filename))
else:
- os.remove(dir+'/'+filename)
+ os.remove(os.path.join(dir, filename))
os.rmdir(dir)
@@ -49,7 +51,7 @@ def save_compact_model(filename, savefunc, prefix, suffices, infodict):
savefunc(tempdir+'/'+prefix)
# zipping
- outputfile = zipfile.ZipFile(filename, mode='w')
+    outputfile = zipfile.ZipFile(filename, mode='w', allowZip64=True)
for suffix in suffices:
outputfile.write(tempdir+'/'+prefix+suffix, prefix+suffix)
outputfile.writestr('modelconfig.json', json.dumps(infodict))
@@ -58,6 +60,7 @@ def save_compact_model(filename, savefunc, prefix, suffices, infodict):
# delete temporary files
removedir(tempdir)
+
def load_compact_model(filename, loadfunc, prefix, infodict):
""" Load a model from a compact file that contains multiple files related to the model.
@@ -93,7 +96,81 @@ def load_compact_model(filename, loadfunc, prefix, infodict):
return returnobj
-# decorator that adds compact model methods to classifier dynamically
+
+class CompactIOMachine:
+ """ Base class that implements compact model I/O.
+
+ This is to replace the original :func:`compactio` decorator.
+
+ """
+ def __init__(self, infodict, prefix, suffices):
+ """
+
+ :param infodict: information about the model. Must contain the key 'classifier'.
+ :param prefix: prefix of names of the model file
+ :param suffices: suffices of the names of the model file
+ :type infodict: dict
+ :type prefix: str
+ :type suffices: list
+ """
+ self.infodict = infodict
+ self.prefix = prefix
+ self.suffices = suffices
+
+ def savemodel(self, nameprefix):
+ """ Abstract method for `savemodel`.
+
+ :param nameprefix: prefix of the model path
+ :type nameprefix: str
+ """
+        raise e.OperationNotDefinedException('savemodel')
+
+ def loadmodel(self, nameprefix):
+ """ Abstract method for `loadmodel`.
+
+ :param nameprefix: prefix of the model path
+ :type nameprefix: str
+ """
+        raise e.OperationNotDefinedException('loadmodel')
+
+ def save_compact_model(self, filename, *args, **kwargs):
+ """ Save the model in a compressed binary format.
+
+ :param filename: name of the model file
+ :param args: arguments
+ :param kwargs: arguments
+ :type filename: str
+ :type args: dict
+ :type kwargs: dict
+ """
+ save_compact_model(filename, self.savemodel, self.prefix, self.suffices, self.infodict, *args, **kwargs)
+
+ def load_compact_model(self, filename, *args, **kwargs):
+ """ Load the model in a compressed binary format.
+
+ :param filename: name of the model file
+ :param args: arguments
+ :param kwargs: arguments
+ :type filename: str
+ :type args: dict
+ :type kwargs: dict
+ """
+ return load_compact_model(filename, self.loadmodel, self.prefix, self.infodict, *args, **kwargs)
+
+ def get_info(self):
+ """ Getting information for the dressed machine.
+
+ :return: dictionary of the information for the dressed machine.
+ :rtype: dict
+ """
+ return {'classifier': self.infodict['classifier'],
+ 'prefix': self.prefix,
+ 'suffices': self.suffices}
+
+
+# decorator that adds compact model methods to classifier dynamically (deprecated)
+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+ details="Use `CompactIOMachine` instead")
def CompactIOClassifier(Classifier, infodict, prefix, suffices):
""" Returns a decorated class object with additional methods for compact model I/O.
@@ -113,21 +190,27 @@ def CompactIOClassifier(Classifier, infodict, prefix, suffices):
"""
# define the inherit class
class DressedClassifier(Classifier):
- def save_compact_model(self, filename):
- save_compact_model(filename, self.savemodel, prefix, suffices, infodict)
+ def save_compact_model(self, filename, *args, **kwargs):
+ save_compact_model(filename, self.savemodel, prefix, suffices, infodict, *args, **kwargs)
- def load_compact_model(self, filename):
- return load_compact_model(filename, self.loadmodel, prefix, infodict)
+ def load_compact_model(self, filename, *args, **kwargs):
+ return load_compact_model(filename, self.loadmodel, prefix, infodict, *args, **kwargs)
def get_info(self):
return {'classifier': infodict['classifier'],
'prefix': prefix,
'suffices': suffices}
+ DressedClassifier.__name__ = Classifier.__name__
+ DressedClassifier.__doc__ = Classifier.__doc__
+
# return decorated classifier
return DressedClassifier
-# decorator for use
+
+# decorator for use (deprecated)
+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+ details="Use `CompactIOMachine` instead")
def compactio(infodict, prefix, suffices):
""" Returns a decorator that performs the decoration by :func:`CompactIOClassifier`.
@@ -142,6 +225,7 @@ def compactio(infodict, prefix, suffices):
"""
return partial(CompactIOClassifier, infodict=infodict, prefix=prefix, suffices=suffices)
+
def get_model_config_field(filename, parameter):
""" Return the configuration parameter of a model file.
@@ -156,9 +240,15 @@ def get_model_config_field(filename, parameter):
:rtype: str
"""
inputfile = zipfile.ZipFile(filename, mode='r')
- readinfodict = json.load(inputfile.open('modelconfig.json', 'r'))
+ modelconfig_file = inputfile.open('modelconfig.json', 'r')
+ modelconfig_json = modelconfig_file.read()
+ modelconfig_file.close()
+    if isinstance(modelconfig_json, bytes):
+ modelconfig_json = modelconfig_json.decode('utf-8')
+ readinfodict = json.loads(modelconfig_json)
return readinfodict[parameter]
+
def get_model_classifier_name(filename):
""" Return the name of the classifier from a model file.
@@ -170,4 +260,4 @@ def get_model_classifier_name(filename):
:type filename: str
:rtype: str
"""
- return get_model_config_field(filename, 'classifier')
\ No newline at end of file
+ return get_model_config_field(filename, 'classifier')
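A minimal sketch (editorial note, not part of the patch) of how a model class plugs into `CompactIOMachine`; the class, prefix, and file names below are hypothetical:

    import json
    from shorttext.utils.compactmodel_io import CompactIOMachine

    class ToyModel(CompactIOMachine):
        def __init__(self):
            # register the classifier name, the file prefix, and the expected file suffices
            CompactIOMachine.__init__(self, {'classifier': 'toymodel'}, 'toymodel', ['_config.json'])
            self.config = {'alpha': 1.0}

        def savemodel(self, nameprefix):
            json.dump(self.config, open(nameprefix + '_config.json', 'w'))

        def loadmodel(self, nameprefix):
            self.config = json.load(open(nameprefix + '_config.json', 'r'))

    model = ToyModel()
    model.save_compact_model('toymodel.bin')   # zips the individual files into one binary
    model.load_compact_model('toymodel.bin')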
diff --git a/src/shorttext/utils/dtm.py b/src/shorttext/utils/dtm.py
new file mode 100644
index 00000000..6912c100
--- /dev/null
+++ b/src/shorttext/utils/dtm.py
@@ -0,0 +1,339 @@
+
+import pickle
+from typing import Optional, Any
+from types import FunctionType
+
+import numpy as np
+import npdict
+from gensim.corpora import Dictionary
+from gensim.models import TfidfModel
+from npdict import SparseArrayWrappedDict
+from scipy.sparse import dok_matrix
+from deprecation import deprecated
+from nptyping import NDArray, Shape, Int
+
+from .compactmodel_io import CompactIOMachine
+from .classification_exceptions import NotImplementedException
+from .textpreprocessing import advanced_text_tokenizer_1
+
+
+dtm_suffices = ['_docids.pkl', '_dictionary.dict', '_dtm.pkl']
+npdtm_suffices = []
+
+
+def generate_npdict_document_term_matrix(
+ corpus: list[str],
+ doc_ids: list[Any],
+ tokenize_func: FunctionType
+) -> npdict.NumpyNDArrayWrappedDict:
+ # grabbing tokens from each document in the corpus
+ doc_tokens = [tokenize_func(document) for document in corpus]
+ tokens_set = set([
+ token
+ for document in doc_tokens
+ for token in document
+ ])
+ npdtm = npdict.SparseArrayWrappedDict(
+ [doc_ids, sorted(list(tokens_set))],
+ default_initial_value=0.0
+ )
+ for doc_id, document in zip(doc_ids, doc_tokens):
+ for token in document:
+ npdtm[doc_id, token] += 1
+ return npdtm
+
+
+def compute_document_frequency(
+ npdtm: npdict.NumpyNDArrayWrappedDict
+) -> NDArray[Shape["*"], Int]:
+ if isinstance(npdtm, npdict.SparseArrayWrappedDict):
+ return np.sum(npdtm.to_coo() > 0, axis=0).todense()
+ else:
+ return np.sum(npdtm.to_numpy() > 0, axis=0)
+
+
+def compute_tfidf_document_term_matrix(
+ npdtm: npdict.NumpyNDArrayWrappedDict,
+ sparse: bool=True
+) -> npdict.NumpyNDArrayWrappedDict:
+ doc_frequencies = compute_document_frequency(npdtm)
+ nbdocs = npdtm.dimension_sizes[0]
+ if isinstance(npdtm, npdict.SparseArrayWrappedDict):
+ new_dtm_sparray = npdtm.to_coo() * np.log(nbdocs / doc_frequencies)
+ return npdict.SparseArrayWrappedDict.generate_dict(new_dtm_sparray, dense=not sparse)
+ else:
+ new_dtm_nparray = npdtm.to_numpy() * np.log(nbdocs / doc_frequencies)
+ new_npdtm = npdict.NumpyNDArrayWrappedDict.generate_dict(new_dtm_nparray)
+ if sparse:
+ new_sparse_dtm = npdict.SparseArrayWrappedDict.from_NumpyNDArrayWrappedDict(
+ new_npdtm, default_initial_value=0.0
+ )
+ return new_sparse_dtm
+ else:
+ return new_npdtm
+
+
+class NumpyDocumentTermMatrix(CompactIOMachine):
+ def __init__(
+ self,
+ corpus: Optional[list[str]]=None,
+ docids: Optional[list[Any]]=None,
+ tfidf: bool=False,
+ tokenize_func: Optional[FunctionType]=None
+ ):
+ CompactIOMachine.__init__(self, {'classifier': 'npdtm'}, 'dtm', dtm_suffices)
+ self.tokenize_func = tokenize_func if tokenize_func is not None else advanced_text_tokenizer_1
+
+ # generate DTM
+ if corpus is not None:
+ self.generate_dtm(corpus, docids=docids, tfidf=tfidf)
+
+ def generate_dtm(
+ self,
+ corpus: list[str],
+ docids: Optional[list[Any]]=None,
+ tfidf: bool=False
+ ):
+ # wrangling document IDs
+ if docids is None:
+ doc_ids = [f"doc{i}" for i in range(len(corpus))]
+ else:
+ if len(docids) == len(corpus):
+ doc_ids = docids
+ elif len(docids) > len(corpus):
+ doc_ids = docids[:len(corpus)]
+ else:
+ doc_ids = docids + [f"doc{i}" for i in range(len(docids), len(corpus))]
+
+ self.npdtm = generate_npdict_document_term_matrix(corpus, doc_ids, self.tokenize_func)
+
+ if tfidf:
+ self.npdtm = compute_tfidf_document_term_matrix(self.npdtm, sparse=True)
+
+ def get_termfreq(self, docid: str, token: str) -> float:
+ return self.npdtm[docid, token]
+
+ def get_total_termfreq(self, token: str) -> float:
+ token_index = self.npdtm._keystrings_to_indices[1][token]
+ if isinstance(self.npdtm, SparseArrayWrappedDict):
+ matrix = self.npdtm.to_coo()
+ else:
+ matrix = self.npdtm.to_numpy()
+ return np.sum(matrix[:, token_index])
+
+ def get_doc_frequency(self, token) -> int:
+ token_index = self.npdtm._keystrings_to_indices[1][token]
+ if isinstance(self.npdtm, npdict.SparseArrayWrappedDict):
+ freq_array = self.npdtm.to_coo()[:, token_index]
+ return np.sum(freq_array > 0, axis=0).todense()
+ else:
+ freq_array = self.npdtm.to_numpy()[:, token_index]
+ return np.sum(freq_array > 0, axis=0)
+
+ def get_token_occurences(self, token: str) -> dict[str, float]:
+ return {
+ docid: self.npdtm[docid, token]
+ for docid in self.npdtm._lists_keystrings[0]
+ }
+
+ def get_doc_tokens(self, docid: str) -> dict[str, float]:
+ return {
+ token: self.npdtm[docid, token]
+ for token in self.npdtm._lists_keystrings[1]
+ }
+
+
+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+ details="Use `NumpyDocumentTermMatrix` instead")
+class DocumentTermMatrix(CompactIOMachine):
+ """ Document-term matrix for corpus.
+
+ This is a class that handles the document-term matrix (DTM). With a given corpus, users can
+ retrieve term frequency, document frequency, and total term frequency. Weighing using tf-idf
+ can be applied.
+ """
+ def __init__(self, corpus, docids=None, tfidf=False):
+ """ Initialize the document-term matrix (DTM) class with a given corpus.
+
+    If document IDs (docids) are given, they will be stored and output as appropriate.
+ If not, the documents are indexed by numbers.
+
+ Users can choose to weigh by tf-idf. The default is not to weigh.
+
+ The corpus has to be a list of lists, with each of the inside list contains all the tokens
+ in each document.
+
+ :param corpus: corpus.
+ :param docids: list of designated document IDs. (Default: None)
+ :param tfidf: whether to weigh using tf-idf. (Default: False)
+ :type corpus: list
+ :type docids: list
+ :type tfidf: bool
+ """
+ CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm', dtm_suffices)
+ if docids is None:
+ self.docid_dict = {i: i for i in range(len(corpus))}
+ self.docids = [i for i in range(len(corpus))]
+ else:
+ if len(docids) == len(corpus):
+ self.docid_dict = {docid: i for i, docid in enumerate(docids)}
+ self.docids = docids
+ elif len(docids) > len(corpus):
+ self.docid_dict = {docid: i for i, docid in zip(range(len(corpus)), docids[:len(corpus)])}
+ self.docids = docids[:len(corpus)]
+ else:
+                self.docid_dict = {docid: i for i, docid in enumerate(docids)}
+                self.docid_dict.update({i: i for i in range(len(docids), len(corpus))})
+ self.docids = docids + [i for i in range(len(docids), len(corpus))]
+ # generate DTM
+ self.generate_dtm(corpus, tfidf=tfidf)
+
+ def generate_dtm(self, corpus, tfidf=False):
+ """ Generate the inside document-term matrix and other peripherical information
+ objects. This is run when the class is instantiated.
+
+ :param corpus: corpus.
+ :param tfidf: whether to weigh using tf-idf. (Default: False)
+ :return: None
+ :type corpus: list
+ :type tfidf: bool
+ """
+ self.dictionary = Dictionary(corpus)
+ self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float64)
+ bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus]
+ if tfidf:
+ weighted_model = TfidfModel(bow_corpus)
+ bow_corpus = weighted_model[bow_corpus]
+ for docid in self.docids:
+ for tokenid, count in bow_corpus[self.docid_dict[docid]]:
+ self.dtm[self.docid_dict[docid], tokenid] = count
+
+ def get_termfreq(self, docid, token):
+ """ Retrieve the term frequency of a given token in a particular document.
+
+ Given a token and a particular document ID, compute the term frequency for this
+ token. If `tfidf` is set to `True` while instantiating the class, it returns the weighted
+ term frequency.
+
+ :param docid: document ID
+ :param token: term or token
+ :return: term frequency or weighted term frequency of the given token in this document (designated by docid)
+ :type docid: any
+ :type token: str
+ :rtype: numpy.float
+ """
+ return self.dtm[self.docid_dict[docid], self.dictionary.token2id[token]]
+
+ def get_total_termfreq(self, token):
+ """ Retrieve the total occurrences of the given token.
+
+ Compute the total occurrences of the term in all documents. If `tfidf` is set to `True`
+ while instantiating the class, it returns the sum of weighted term frequency.
+
+ :param token: term or token
+ :return: total occurrences of the given token
+ :type token: str
+ :rtype: numpy.float
+ """
+ return sum(self.dtm[:, self.dictionary.token2id[token]].values())
+
+ def get_doc_frequency(self, token):
+ """ Retrieve the document frequency of the given token.
+
+ Compute the document frequency of the given token, i.e., the number of documents
+ that this token can be found.
+
+ :param token: term or token
+ :return: document frequency of the given token
+ :type token: str
+ :rtype: int
+ """
+ return len(self.dtm[:, self.dictionary.token2id[token]].values())
+
+ def get_token_occurences(self, token):
+ """ Retrieve the term frequencies of a given token in all documents.
+
+ Compute the term frequencies of the given token for all the documents. If `tfidf` is
+ set to be `True` while instantiating the class, it returns the weighted term frequencies.
+
+ This method returns a dictionary of term frequencies with the corresponding document IDs
+ as the keys.
+
+ :param token: term or token
+ :return: a dictionary of term frequencies with the corresponding document IDs as the keys
+ :type token: str
+ :rtype: dict
+ """
+ return {self.docids[docidx]: count for (docidx, _), count in self.dtm[:, self.dictionary.token2id[token]].items()}
+
+ def get_doc_tokens(self, docid):
+ """ Retrieve the term frequencies of all tokens in the given document.
+
+ Compute the term frequencies of all tokens for the given document. If `tfidf` is
+ set to be `True` while instantiating the class, it returns the weighted term frequencies.
+
+ This method returns a dictionary of term frequencies with the tokens as the keys.
+
+ :param docid: document ID
+ :return: a dictionary of term frequencies with the tokens as the keys
+ :type docid: any
+ :rtype: dict
+ """
+ return {self.dictionary[tokenid]: count for (_, tokenid), count in self.dtm[self.docid_dict[docid], :].items()}
+
+ def generate_dtm_dataframe(self):
+ """ Generate the data frame of the document-term matrix. (shorttext <= 1.0.3)
+
+ Now it raises exception.
+
+ :return: data frame of the document-term matrix
+ :rtype: pandas.DataFrame
+ :raise: NotImplementedException
+ """
+ raise NotImplementedException()
+
+ def savemodel(self, prefix):
+ """ Save the model.
+
+ :param prefix: prefix of the files
+ :return: None
+ :type prefix: str
+ """
+ pickle.dump(self.docids, open(prefix+'_docids.pkl', 'wb'))
+ self.dictionary.save(prefix+'_dictionary.dict')
+ pickle.dump(self.dtm, open(prefix+'_dtm.pkl', 'wb'))
+
+ def loadmodel(self, prefix):
+ """ Load the model.
+
+ :param prefix: prefix of the files
+ :return: None
+ :type prefix: str
+ """
+ self.docids = pickle.load(open(prefix+'_docids.pkl', 'rb'))
+ self.docid_dict = {docid: i for i, docid in enumerate(self.docids)}
+ self.dictionary = Dictionary.load(prefix+'_dictionary.dict')
+ self.dtm = pickle.load(open(prefix+'_dtm.pkl', 'rb'))
+
+
+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+ details="Use `npdict` instead")
+def load_DocumentTermMatrix(filename, compact=True):
+ """ Load presaved Document-Term Matrix (DTM).
+
+ Given the file name (if `compact` is `True`) or the prefix (if `compact` is `False`),
+ return the document-term matrix.
+
+ :param filename: file name or prefix
+ :param compact: whether it is a compact model. (Default: `True`)
+ :return: document-term matrix
+ :type filename: str
+ :type compact: bool
+ :rtype: DocumentTermMatrix
+ """
+ dtm = DocumentTermMatrix([[]])
+ if compact:
+ dtm.load_compact_model(filename)
+ else:
+ dtm.loadmodel(filename)
+ return dtm
\ No newline at end of file
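A minimal usage sketch (editorial note, not part of the patch) of the (deprecated) `DocumentTermMatrix`; `NumpyDocumentTermMatrix` is the intended replacement:

    from shorttext.utils.dtm import DocumentTermMatrix

    corpus = [['deep', 'learning', 'model'],
              ['topic', 'model', 'of', 'text'],
              ['deep', 'topic', 'model']]
    dtm = DocumentTermMatrix(corpus, docids=['doc1', 'doc2', 'doc3'])
    print(dtm.get_termfreq('doc3', 'model'))   # 1.0
    print(dtm.get_total_termfreq('model'))     # 3.0
    print(dtm.get_doc_frequency('deep'))       # 2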
diff --git a/shorttext/utils/gensim_corpora.py b/src/shorttext/utils/gensim_corpora.py
similarity index 85%
rename from shorttext/utils/gensim_corpora.py
rename to src/shorttext/utils/gensim_corpora.py
index 5c58250c..9d0de0d6 100644
--- a/shorttext/utils/gensim_corpora.py
+++ b/src/shorttext/utils/gensim_corpora.py
@@ -1,5 +1,10 @@
+
+from collections import defaultdict
+
import gensim
-from .textpreprocessing import spacy_tokenize as tokenize
+
+from .textpreprocessing import tokenize
+
def generate_gensim_corpora(classdict, preprocess_and_tokenize=tokenize):
""" Generate gensim bag-of-words dictionary and corpus.
@@ -21,6 +26,7 @@ def generate_gensim_corpora(classdict, preprocess_and_tokenize=tokenize):
corpus = [dictionary.doc2bow(doctokens) for doctokens in doc]
return dictionary, corpus, classlabels
+
def save_corpus(dictionary, corpus, prefix):
""" Save gensim corpus and dictionary.
@@ -71,3 +77,18 @@ def update_corpus_labels(dictionary, corpus, newclassdict, preprocess_and_tokeni
corpus += newcorpus
return corpus, newcorpus
+
+
+def tokens_to_fracdict(tokens):
+ """ Return normalized bag-of-words (BOW) vectors.
+
+ :param tokens: list of tokens.
+ :type tokens: list
+ :return: normalized vectors of counts of tokens as a `dict`
+ :rtype: dict
+ """
+    cntdict = defaultdict(int)
+ for token in tokens:
+ cntdict[token] += 1
+ totalcnt = sum(cntdict.values())
+ return {token: float(cnt)/totalcnt for token, cnt in cntdict.items()}
\ No newline at end of file
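A minimal usage sketch (editorial note, not part of the patch) of the normalized bag-of-words helper:

    from shorttext.utils.gensim_corpora import tokens_to_fracdict

    print(tokens_to_fracdict(['to', 'be', 'or', 'not', 'to', 'be']))
    # {'to': 0.333..., 'be': 0.333..., 'or': 0.166..., 'not': 0.166...}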
diff --git a/shorttext/utils/kerasmodel_io.py b/src/shorttext/utils/kerasmodel_io.py
similarity index 69%
rename from shorttext/utils/kerasmodel_io.py
rename to src/shorttext/utils/kerasmodel_io.py
index 720440b4..d772de7c 100644
--- a/shorttext/utils/kerasmodel_io.py
+++ b/src/shorttext/utils/kerasmodel_io.py
@@ -1,4 +1,6 @@
-from keras.models import model_from_json
+
+from tensorflow.keras.models import model_from_json
+
def save_model(nameprefix, model):
""" Save a keras sequential model into files.
@@ -10,11 +12,12 @@ def save_model(nameprefix, model):
:param model: keras sequential model to be saved
:return: None
:type nameprefix: str
- :type model: keras.models.Sequential
+ :type model: keras.models.Model
"""
model_json = model.to_json()
- open(nameprefix+'.json', 'wb').write(model_json)
- model.save_weights(nameprefix+'.h5')
+ open(nameprefix+'.json', 'w').write(model_json)
+ model.save_weights(nameprefix+'.weights.h5')
+
def load_model(nameprefix):
""" Load a keras sequential model from files.
@@ -25,8 +28,8 @@ def load_model(nameprefix):
:param nameprefix: Prefix of the paths of the model files
:return: keras sequential model
:type nameprefix: str
- :rtype: keras.models.Sequential
+ :rtype: keras.models.Model
"""
- model = model_from_json(open(nameprefix+'.json', 'rb').read())
- model.load_weights(nameprefix+'.h5')
+ model = model_from_json(open(nameprefix+'.json', 'r').read())
+ model.load_weights(nameprefix+'.weights.h5')
return model
\ No newline at end of file
diff --git a/src/shorttext/utils/misc.py b/src/shorttext/utils/misc.py
new file mode 100644
index 00000000..b1b6ad5e
--- /dev/null
+++ b/src/shorttext/utils/misc.py
@@ -0,0 +1,37 @@
+
+
+def textfile_generator(textfile, linebreak=True, encoding=None):
+ """ Return a generator that reads lines in a text file.
+
+ :param textfile: file object of a text file
+ :param linebreak: whether to return a line break at the end of each line (Default: True)
+ :param encoding: encoding of the text file (Default: None)
+ :return: a generator that reads lines in a text file
+ :type textfile: file
+ :type linebreak: bool
+ :type encoding: str
+ :rtype: generator
+ """
+ for t in textfile:
+ if len(t) > 0:
+ if encoding is None:
+ yield t.strip() + ('\n' if linebreak else '')
+ else:
+ yield t.decode(encoding).strip() + ('\n' if linebreak else '')
+
+
+class SinglePoolExecutor:
+ """ It is a wrapper for Python `map` functions.
+
+ """
+ def map(self, func, *iterables):
+ """ Refer to Python `map` documentation.
+
+ :param func: function
+ :param iterables: iterables to loop
+ :return: generator for the map
+ :type func: function
+ :type iterables: iterables
+ :rtype: map
+ """
+ return map(func, *iterables)
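A minimal usage sketch (editorial note, not part of the patch); `corpus.txt` is a hypothetical plain-text file with one document per line:

    from shorttext.utils.misc import textfile_generator

    with open('corpus.txt', 'r') as f:
        for line in textfile_generator(f, linebreak=False):
            print(line)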
diff --git a/src/shorttext/utils/nonneg_stopwords.txt b/src/shorttext/utils/nonneg_stopwords.txt
new file mode 100644
index 00000000..8cd4eea2
--- /dev/null
+++ b/src/shorttext/utils/nonneg_stopwords.txt
@@ -0,0 +1,2778 @@
+og
+i
+jeg
+det
+at
+en
+den
+til
+er
+som
+på
+de
+med
+han
+af
+for
+ikke
+der
+var
+mig
+sig
+men
+et
+har
+om
+vi
+min
+havde
+ham
+hun
+nu
+over
+da
+fra
+du
+ud
+sin
+dem
+os
+op
+man
+hans
+hvor
+eller
+hvad
+skal
+selv
+her
+alle
+vil
+blev
+kunne
+ind
+når
+være
+dog
+noget
+ville
+jo
+deres
+efter
+ned
+skulle
+denne
+end
+dette
+mit
+også
+under
+have
+dig
+anden
+hende
+mine
+alt
+meget
+sit
+sine
+vor
+mod
+disse
+hvis
+din
+nogle
+hos
+blive
+mange
+ad
+bliver
+hendes
+været
+thi
+jer
+sådan
+de
+en
+van
+ik
+te
+dat
+die
+in
+een
+hij
+het
+niet
+zijn
+is
+was
+op
+aan
+met
+als
+voor
+had
+er
+maar
+om
+hem
+dan
+zou
+of
+wat
+mijn
+men
+dit
+zo
+door
+over
+ze
+zich
+bij
+ook
+tot
+je
+mij
+uit
+der
+daar
+haar
+naar
+heb
+hoe
+heeft
+hebben
+deze
+u
+want
+nog
+zal
+me
+zij
+nu
+ge
+geen
+omdat
+iets
+worden
+toch
+al
+waren
+veel
+meer
+doen
+toen
+moet
+ben
+zonder
+kan
+hun
+dus
+alles
+onder
+ja
+eens
+hier
+wie
+werd
+altijd
+doch
+wordt
+wezen
+kunnen
+ons
+zelf
+tegen
+na
+reeds
+wil
+kon
+niets
+uw
+iemand
+geweest
+andere
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+only
+own
+same
+so
+than
+too
+very
+s
+t
+can
+will
+just
+don
+should
+now
+d
+ll
+m
+o
+re
+ve
+y
+ain
+aren
+couldn
+didn
+doesn
+hadn
+hasn
+haven
+isn
+ma
+mightn
+mustn
+needn
+shan
+shouldn
+wasn
+weren
+won
+wouldn
+olla
+olen
+olet
+on
+olemme
+olette
+ovat
+ole
+oli
+olisi
+olisit
+olisin
+olisimme
+olisitte
+olisivat
+olit
+olin
+olimme
+olitte
+olivat
+ollut
+olleet
+en
+et
+ei
+emme
+ette
+eivät
+minä
+minun
+minut
+minua
+minussa
+minusta
+minuun
+minulla
+minulta
+minulle
+sinä
+sinun
+sinut
+sinua
+sinussa
+sinusta
+sinuun
+sinulla
+sinulta
+sinulle
+hän
+hänen
+hänet
+häntä
+hänessä
+hänestä
+häneen
+hänellä
+häneltä
+hänelle
+me
+meidän
+meidät
+meitä
+meissä
+meistä
+meihin
+meillä
+meiltä
+meille
+te
+teidän
+teidät
+teitä
+teissä
+teistä
+teihin
+teillä
+teiltä
+teille
+he
+heidän
+heidät
+heitä
+heissä
+heistä
+heihin
+heillä
+heiltä
+heille
+tämä
+tämän
+tätä
+tässä
+tästä
+tähän
+tallä
+tältä
+tälle
+tänä
+täksi
+tuo
+tuon
+tuotä
+tuossa
+tuosta
+tuohon
+tuolla
+tuolta
+tuolle
+tuona
+tuoksi
+se
+sen
+sitä
+siinä
+siitä
+siihen
+sillä
+siltä
+sille
+sinä
+siksi
+nämä
+näiden
+näitä
+näissä
+näistä
+näihin
+näillä
+näiltä
+näille
+näinä
+näiksi
+nuo
+noiden
+noita
+noissa
+noista
+noihin
+noilla
+noilta
+noille
+noina
+noiksi
+ne
+niiden
+niitä
+niissä
+niistä
+niihin
+niillä
+niiltä
+niille
+niinä
+niiksi
+kuka
+kenen
+kenet
+ketä
+kenessä
+kenestä
+keneen
+kenellä
+keneltä
+kenelle
+kenenä
+keneksi
+ketkä
+keiden
+ketkä
+keitä
+keissä
+keistä
+keihin
+keillä
+keiltä
+keille
+keinä
+keiksi
+mikä
+minkä
+minkä
+mitä
+missä
+mistä
+mihin
+millä
+miltä
+mille
+minä
+miksi
+mitkä
+joka
+jonka
+jota
+jossa
+josta
+johon
+jolla
+jolta
+jolle
+jona
+joksi
+jotka
+joiden
+joita
+joissa
+joista
+joihin
+joilla
+joilta
+joille
+joina
+joiksi
+että
+ja
+jos
+koska
+kuin
+mutta
+niin
+sekä
+sillä
+tai
+vaan
+vai
+vaikka
+kanssa
+mukaan
+noin
+poikki
+yli
+kun
+niin
+nyt
+itse
+au
+aux
+avec
+ce
+ces
+dans
+de
+des
+du
+elle
+en
+et
+eux
+il
+je
+la
+le
+leur
+lui
+ma
+mais
+me
+même
+mes
+moi
+mon
+ne
+nos
+notre
+nous
+on
+ou
+par
+pas
+pour
+qu
+que
+qui
+sa
+se
+ses
+son
+sur
+ta
+te
+tes
+toi
+ton
+tu
+un
+une
+vos
+votre
+vous
+c
+d
+j
+l
+à
+m
+n
+s
+t
+y
+été
+étée
+étées
+étés
+étant
+étante
+étants
+étantes
+suis
+es
+est
+sommes
+êtes
+sont
+serai
+seras
+sera
+serons
+serez
+seront
+serais
+serait
+serions
+seriez
+seraient
+étais
+était
+étions
+étiez
+étaient
+fus
+fut
+fûmes
+fûtes
+furent
+sois
+soit
+soyons
+soyez
+soient
+fusse
+fusses
+fût
+fussions
+fussiez
+fussent
+ayant
+ayante
+ayantes
+ayants
+eu
+eue
+eues
+eus
+ai
+as
+avons
+avez
+ont
+aurai
+auras
+aura
+aurons
+aurez
+auront
+aurais
+aurait
+aurions
+auriez
+auraient
+avais
+avait
+avions
+aviez
+avaient
+eut
+eûmes
+eûtes
+eurent
+aie
+aies
+ait
+ayons
+ayez
+aient
+eusse
+eusses
+eût
+eussions
+eussiez
+eussent
+aber
+alle
+allem
+allen
+aller
+alles
+als
+also
+am
+an
+ander
+andere
+anderem
+anderen
+anderer
+anderes
+anderm
+andern
+anderr
+anders
+auch
+auf
+aus
+bei
+bin
+bis
+bist
+da
+damit
+dann
+der
+den
+des
+dem
+die
+das
+daß
+derselbe
+derselben
+denselben
+desselben
+demselben
+dieselbe
+dieselben
+dasselbe
+dazu
+dein
+deine
+deinem
+deinen
+deiner
+deines
+denn
+derer
+dessen
+dich
+dir
+du
+dies
+diese
+diesem
+diesen
+dieser
+dieses
+doch
+dort
+durch
+ein
+eine
+einem
+einen
+einer
+eines
+einig
+einige
+einigem
+einigen
+einiger
+einiges
+einmal
+er
+ihn
+ihm
+es
+etwas
+euer
+eure
+eurem
+euren
+eurer
+eures
+für
+gegen
+gewesen
+hab
+habe
+haben
+hat
+hatte
+hatten
+hier
+hin
+hinter
+ich
+mich
+mir
+ihr
+ihre
+ihrem
+ihren
+ihrer
+ihres
+euch
+im
+in
+indem
+ins
+ist
+jede
+jedem
+jeden
+jeder
+jedes
+jene
+jenem
+jenen
+jener
+jenes
+jetzt
+kann
+kein
+keine
+keinem
+keinen
+keiner
+keines
+können
+könnte
+machen
+man
+manche
+manchem
+manchen
+mancher
+manches
+mein
+meine
+meinem
+meinen
+meiner
+meines
+mit
+muss
+musste
+nach
+nicht
+nichts
+noch
+nun
+nur
+ob
+oder
+ohne
+sehr
+sein
+seine
+seinem
+seinen
+seiner
+seines
+selbst
+sich
+sie
+ihnen
+sind
+so
+solche
+solchem
+solchen
+solcher
+solches
+soll
+sollte
+sondern
+sonst
+über
+um
+und
+uns
+unsere
+unserem
+unseren
+unser
+unseres
+unter
+viel
+vom
+von
+vor
+während
+war
+waren
+warst
+was
+weg
+weil
+weiter
+welche
+welchem
+welchen
+welcher
+welches
+wenn
+werde
+werden
+wie
+wieder
+will
+wir
+wird
+wirst
+wo
+wollen
+wollte
+würde
+würden
+zu
+zum
+zur
+zwar
+zwischen
+a
+ahogy
+ahol
+aki
+akik
+akkor
+alatt
+által
+általában
+amely
+amelyek
+amelyekben
+amelyeket
+amelyet
+amelynek
+ami
+amit
+amolyan
+amíg
+amikor
+át
+abban
+ahhoz
+annak
+arra
+arról
+az
+azok
+azon
+azt
+azzal
+azért
+aztán
+azután
+azonban
+bár
+be
+belül
+benne
+cikk
+cikkek
+cikkeket
+csak
+de
+e
+eddig
+egész
+egy
+egyes
+egyetlen
+egyéb
+egyik
+egyre
+ekkor
+el
+elég
+ellen
+elõ
+elõször
+elõtt
+elsõ
+én
+éppen
+ebben
+ehhez
+emilyen
+ennek
+erre
+ez
+ezt
+ezek
+ezen
+ezzel
+ezért
+és
+fel
+felé
+hanem
+hiszen
+hogy
+hogyan
+igen
+így
+illetve
+ill.
+ill
+ilyen
+ilyenkor
+ison
+ismét
+itt
+jó
+jól
+jobban
+kell
+kellett
+keresztül
+keressünk
+ki
+kívül
+között
+közül
+legalább
+lehet
+lehetett
+legyen
+lenne
+lenni
+lesz
+lett
+maga
+magát
+majd
+majd
+már
+más
+másik
+meg
+még
+mellett
+mert
+mely
+melyek
+mi
+mit
+míg
+miért
+milyen
+mikor
+minden
+mindent
+mindenki
+mindig
+mint
+mintha
+mivel
+most
+nagy
+nagyobb
+nagyon
+ne
+néha
+nekem
+neki
+nem
+néhány
+nélkül
+nincs
+olyan
+ott
+össze
+õ
+õk
+õket
+pedig
+persze
+rá
+s
+saját
+sem
+semmi
+sok
+sokat
+sokkal
+számára
+szemben
+szerint
+szinte
+talán
+tehát
+teljes
+tovább
+továbbá
+több
+úgy
+ugyanis
+új
+újabb
+újra
+után
+utána
+utolsó
+vagy
+vagyis
+valaki
+valami
+valamint
+való
+vagyok
+van
+vannak
+volt
+voltam
+voltak
+voltunk
+vissza
+vele
+viszont
+volna
+ad
+al
+allo
+ai
+agli
+all
+agl
+alla
+alle
+con
+col
+coi
+da
+dal
+dallo
+dai
+dagli
+dall
+dagl
+dalla
+dalle
+di
+del
+dello
+dei
+degli
+dell
+degl
+della
+delle
+in
+nel
+nello
+nei
+negli
+nell
+negl
+nella
+nelle
+su
+sul
+sullo
+sui
+sugli
+sull
+sugl
+sulla
+sulle
+per
+tra
+contro
+io
+tu
+lui
+lei
+noi
+voi
+loro
+mio
+mia
+miei
+mie
+tuo
+tua
+tuoi
+tue
+suo
+sua
+suoi
+sue
+nostro
+nostra
+nostri
+nostre
+vostro
+vostra
+vostri
+vostre
+mi
+ti
+ci
+vi
+lo
+la
+li
+le
+gli
+ne
+il
+un
+uno
+una
+ma
+ed
+se
+perché
+anche
+come
+dov
+dove
+che
+chi
+cui
+non
+più
+quale
+quanto
+quanti
+quanta
+quante
+quello
+quelli
+quella
+quelle
+questo
+questi
+questa
+queste
+si
+tutto
+tutti
+a
+c
+e
+i
+l
+o
+ho
+hai
+ha
+abbiamo
+avete
+hanno
+abbia
+abbiate
+abbiano
+avrò
+avrai
+avrà
+avremo
+avrete
+avranno
+avrei
+avresti
+avrebbe
+avremmo
+avreste
+avrebbero
+avevo
+avevi
+aveva
+avevamo
+avevate
+avevano
+ebbi
+avesti
+ebbe
+avemmo
+aveste
+ebbero
+avessi
+avesse
+avessimo
+avessero
+avendo
+avuto
+avuta
+avuti
+avute
+sono
+sei
+è
+siamo
+siete
+sia
+siate
+siano
+sarò
+sarai
+sarà
+saremo
+sarete
+saranno
+sarei
+saresti
+sarebbe
+saremmo
+sareste
+sarebbero
+ero
+eri
+era
+eravamo
+eravate
+erano
+fui
+fosti
+fu
+fummo
+foste
+furono
+fossi
+fosse
+fossimo
+fossero
+essendo
+faccio
+fai
+facciamo
+fanno
+faccia
+facciate
+facciano
+farò
+farai
+farà
+faremo
+farete
+faranno
+farei
+faresti
+farebbe
+faremmo
+fareste
+farebbero
+facevo
+facevi
+faceva
+facevamo
+facevate
+facevano
+feci
+facesti
+fece
+facemmo
+faceste
+fecero
+facessi
+facesse
+facessimo
+facessero
+facendo
+sto
+stai
+sta
+stiamo
+stanno
+stia
+stiate
+stiano
+starò
+starai
+starà
+staremo
+starete
+staranno
+starei
+staresti
+starebbe
+staremmo
+stareste
+starebbero
+stavo
+stavi
+stava
+stavamo
+stavate
+stavano
+stetti
+stesti
+stette
+stemmo
+steste
+stettero
+stessi
+stesse
+stessimo
+stessero
+stando
+ах
+ох
+эх
+ай
+эй
+ой
+тағы
+тағыда
+әрине
+жоқ
+сондай
+осындай
+осылай
+солай
+мұндай
+бұндай
+мен
+сен
+ол
+біз
+біздер
+олар
+сіз
+сіздер
+маған
+оған
+саған
+біздің
+сіздің
+оның
+бізге
+сізге
+оларға
+біздерге
+сіздерге
+оларға
+менімен
+сенімен
+онымен
+бізбен
+сізбен
+олармен
+біздермен
+сіздермен
+менің
+сенің
+біздің
+сіздің
+оның
+біздердің
+сіздердің
+олардың
+маған
+саған
+оған
+менен
+сенен
+одан
+бізден
+сізден
+олардан
+біздерден
+сіздерден
+олардан
+айтпақшы
+сонымен
+сондықтан
+бұл
+осы
+сол
+анау
+мынау
+сонау
+осынау
+ана
+мына
+сона
+әні
+міне
+өй
+үйт
+бүйт
+біреу
+кейбіреу
+кейбір
+қайсыбір
+әрбір
+бірнеше
+бірдеме
+бірнеше
+әркім
+әрне
+әрқайсы
+әрқалай
+әлдекім
+әлдене
+әлдеқайдан
+әлденеше
+әлдеқалай
+әлдеқашан
+алдақашан
+еш
+ешкім
+ешбір
+ештеме
+дәнеңе
+ешқашан
+ешқандай
+ешқайсы
+емес
+бәрі
+барлық
+барша
+бар
+күллі
+бүкіл
+түгел
+өз
+өзім
+өзің
+өзінің
+өзіме
+өзіне
+өзімнің
+өзі
+өзге
+менде
+сенде
+онда
+менен
+сенен онан
+одан
+ау
+па
+ей
+әй
+е
+уа
+уау
+уай
+я
+пай
+ә
+о
+оһо
+ой
+ие
+аһа
+ау
+беу
+мәссаған
+бәрекелді
+әттегенай
+жаракімалла
+масқарай
+астапыралла
+япырмай
+ойпырмай
+кәне
+кәнеки
+ал
+әйда
+кәні
+міне
+әні
+сорап
+қош-қош
+пфша
+пішә
+құрау-құрау
+шәйт
+шек
+моһ
+тәк
+құрау
+құр
+кә
+кәһ
+күшім
+күшім
+мышы
+пырс
+әукім
+алақай
+паһ-паһ
+бәрекелді
+ура
+әттең
+әттеген-ай
+қап
+түге
+пішту
+шіркін
+алатау
+пай-пай
+үшін
+сайын
+сияқты
+туралы
+арқылы
+бойы
+бойымен
+шамалы
+шақты
+қаралы
+ғұрлы
+ғұрлым
+шейін
+дейін
+қарай
+таман
+салым
+тарта
+жуық
+таяу
+гөрі
+бері
+кейін
+соң
+бұрын
+бетер
+қатар
+бірге
+қоса
+арс
+гүрс
+дүрс
+қорс
+тарс
+тырс
+ырс
+барқ
+борт
+күрт
+кірт
+морт
+сарт
+шырт
+дүңк
+күңк
+қыңқ
+мыңқ
+маңқ
+саңқ
+шаңқ
+шіңк
+сыңқ
+таңқ
+тыңқ
+ыңқ
+болп
+былп
+жалп
+желп
+қолп
+ірк
+ырқ
+сарт-сұрт
+тарс-тұрс
+арс-ұрс
+жалт-жалт
+жалт-жұлт
+қалт-қалт
+қалт-құлт
+қаңқ-қаңқ
+қаңқ-құңқ
+шаңқ-шаңқ
+шаңқ-шұңқ
+арбаң-арбаң
+бүгжең-бүгжең
+арсалаң-арсалаң
+ербелең-ербелең
+батыр-бұтыр
+далаң-далаң
+тарбаң-тарбаң
+қызараң-қызараң
+қаңғыр-күңгір
+қайқаң-құйқаң
+митың-митың
+салаң-сұлаң
+ыржың-тыржың
+бірақ
+алайда
+дегенмен
+әйтпесе
+әйткенмен
+себебі
+өйткені
+сондықтан
+үшін
+сайын
+сияқты
+туралы
+арқылы
+бойы
+бойымен
+шамалы
+шақты
+қаралы
+ғұрлы
+ғұрлым
+гөрі
+бері
+кейін
+соң
+бұрын
+бетер
+қатар
+бірге
+қоса
+шейін
+дейін
+қарай
+таман
+салым
+тарта
+жуық
+таяу
+арнайы
+осындай
+ғана
+қана
+тек
+әншейін
+og
+i
+jeg
+det
+at
+en
+et
+den
+til
+er
+som
+på
+de
+med
+han
+av
+ikke
+ikkje
+der
+så
+var
+meg
+seg
+men
+ett
+har
+om
+vi
+min
+mitt
+ha
+hadde
+hun
+nå
+over
+da
+ved
+fra
+du
+ut
+sin
+dem
+oss
+opp
+man
+kan
+hans
+hvor
+eller
+hva
+skal
+selv
+sjøl
+her
+alle
+vil
+bli
+ble
+blei
+blitt
+kunne
+inn
+når
+være
+kom
+noen
+noe
+ville
+dere
+som
+deres
+kun
+ja
+etter
+ned
+skulle
+denne
+for
+deg
+si
+sine
+sitt
+mot
+å
+meget
+hvorfor
+dette
+disse
+uten
+hvordan
+ingen
+din
+ditt
+blir
+samme
+hvilken
+hvilke
+sånn
+inni
+mellom
+vår
+hver
+hvem
+vors
+hvis
+både
+bare
+enn
+fordi
+før
+mange
+også
+slik
+vært
+være
+båe
+begge
+siden
+dykk
+dykkar
+dei
+deira
+deires
+deim
+di
+då
+eg
+ein
+eit
+eitt
+elles
+honom
+hjå
+ho
+hoe
+henne
+hennar
+hennes
+hoss
+hossen
+ikkje
+ingi
+inkje
+korleis
+korso
+kva
+kvar
+kvarhelst
+kven
+kvi
+kvifor
+me
+medan
+mi
+mine
+mykje
+no
+nokon
+noka
+nokor
+noko
+nokre
+si
+sia
+sidan
+so
+somt
+somme
+um
+upp
+vere
+vore
+verte
+vort
+varte
+vart
+de
+a
+o
+que
+e
+do
+da
+em
+um
+para
+com
+não
+uma
+os
+no
+se
+na
+por
+mais
+as
+dos
+como
+mas
+ao
+ele
+das
+à
+seu
+sua
+ou
+quando
+muito
+nos
+já
+eu
+também
+só
+pelo
+pela
+até
+isso
+ela
+entre
+depois
+sem
+mesmo
+aos
+seus
+quem
+nas
+me
+esse
+eles
+você
+essa
+num
+nem
+suas
+meu
+às
+minha
+numa
+pelos
+elas
+qual
+nós
+lhe
+deles
+essas
+esses
+pelas
+este
+dele
+tu
+te
+vocês
+vos
+lhes
+meus
+minhas
+teu
+tua
+teus
+tuas
+nosso
+nossa
+nossos
+nossas
+dela
+delas
+esta
+estes
+estas
+aquele
+aquela
+aqueles
+aquelas
+isto
+aquilo
+estou
+está
+estamos
+estão
+estive
+esteve
+estivemos
+estiveram
+estava
+estávamos
+estavam
+estivera
+estivéramos
+esteja
+estejamos
+estejam
+estivesse
+estivéssemos
+estivessem
+estiver
+estivermos
+estiverem
+hei
+há
+havemos
+hão
+houve
+houvemos
+houveram
+houvera
+houvéramos
+haja
+hajamos
+hajam
+houvesse
+houvéssemos
+houvessem
+houver
+houvermos
+houverem
+houverei
+houverá
+houveremos
+houverão
+houveria
+houveríamos
+houveriam
+sou
+somos
+são
+era
+éramos
+eram
+fui
+foi
+fomos
+foram
+fora
+fôramos
+seja
+sejamos
+sejam
+fosse
+fôssemos
+fossem
+for
+formos
+forem
+serei
+será
+seremos
+serão
+seria
+seríamos
+seriam
+tenho
+tem
+temos
+tém
+tinha
+tínhamos
+tinham
+tive
+teve
+tivemos
+tiveram
+tivera
+tivéramos
+tenha
+tenhamos
+tenham
+tivesse
+tivéssemos
+tivessem
+tiver
+tivermos
+tiverem
+terei
+terá
+teremos
+terão
+teria
+teríamos
+teriam
+и
+в
+во
+не
+что
+он
+на
+я
+с
+со
+как
+а
+то
+все
+она
+так
+его
+но
+да
+ты
+к
+у
+же
+вы
+за
+бы
+по
+только
+ее
+мне
+было
+вот
+от
+меня
+еще
+нет
+о
+из
+ему
+теперь
+когда
+даже
+ну
+вдруг
+ли
+если
+уже
+или
+ни
+быть
+был
+него
+до
+вас
+нибудь
+опять
+уж
+вам
+ведь
+там
+потом
+себя
+ничего
+ей
+может
+они
+тут
+где
+есть
+надо
+ней
+для
+мы
+тебя
+их
+чем
+была
+сам
+чтоб
+без
+будто
+чего
+раз
+тоже
+себе
+под
+будет
+ж
+тогда
+кто
+этот
+того
+потому
+этого
+какой
+совсем
+ним
+здесь
+этом
+один
+почти
+мой
+тем
+чтобы
+нее
+сейчас
+были
+куда
+зачем
+всех
+никогда
+можно
+при
+наконец
+два
+об
+другой
+хоть
+после
+над
+больше
+тот
+через
+эти
+нас
+про
+всего
+них
+какая
+много
+разве
+три
+эту
+моя
+впрочем
+хорошо
+свою
+этой
+перед
+иногда
+лучше
+чуть
+том
+нельзя
+такой
+им
+более
+всегда
+конечно
+всю
+между
+de
+la
+que
+el
+en
+y
+a
+los
+del
+se
+las
+por
+un
+para
+con
+no
+una
+su
+al
+lo
+como
+más
+pero
+sus
+le
+ya
+o
+este
+sí
+porque
+esta
+entre
+cuando
+muy
+sin
+sobre
+también
+me
+hasta
+hay
+donde
+quien
+desde
+todo
+nos
+durante
+todos
+uno
+les
+ni
+contra
+otros
+ese
+eso
+ante
+ellos
+e
+esto
+mí
+antes
+algunos
+qué
+unos
+yo
+otro
+otras
+otra
+él
+tanto
+esa
+estos
+mucho
+quienes
+nada
+muchos
+cual
+poco
+ella
+estar
+estas
+algunas
+algo
+nosotros
+mi
+mis
+tú
+te
+ti
+tu
+tus
+ellas
+nosotras
+vosostros
+vosostras
+os
+mío
+mía
+míos
+mías
+tuyo
+tuya
+tuyos
+tuyas
+suyo
+suya
+suyos
+suyas
+nuestro
+nuestra
+nuestros
+nuestras
+vuestro
+vuestra
+vuestros
+vuestras
+esos
+esas
+estoy
+estás
+está
+estamos
+estáis
+están
+esté
+estés
+estemos
+estéis
+estén
+estaré
+estarás
+estará
+estaremos
+estaréis
+estarán
+estaría
+estarías
+estaríamos
+estaríais
+estarían
+estaba
+estabas
+estábamos
+estabais
+estaban
+estuve
+estuviste
+estuvo
+estuvimos
+estuvisteis
+estuvieron
+estuviera
+estuvieras
+estuviéramos
+estuvierais
+estuvieran
+estuviese
+estuvieses
+estuviésemos
+estuvieseis
+estuviesen
+estando
+estado
+estada
+estados
+estadas
+estad
+he
+has
+ha
+hemos
+habéis
+han
+haya
+hayas
+hayamos
+hayáis
+hayan
+habré
+habrás
+habrá
+habremos
+habréis
+habrán
+habría
+habrías
+habríamos
+habríais
+habrían
+había
+habías
+habíamos
+habíais
+habían
+hube
+hubiste
+hubo
+hubimos
+hubisteis
+hubieron
+hubiera
+hubieras
+hubiéramos
+hubierais
+hubieran
+hubiese
+hubieses
+hubiésemos
+hubieseis
+hubiesen
+habiendo
+habido
+habida
+habidos
+habidas
+soy
+eres
+es
+somos
+sois
+son
+sea
+seas
+seamos
+seáis
+sean
+seré
+serás
+será
+seremos
+seréis
+serán
+sería
+serías
+seríamos
+seríais
+serían
+era
+eras
+éramos
+erais
+eran
+fui
+fuiste
+fue
+fuimos
+fuisteis
+fueron
+fuera
+fueras
+fuéramos
+fuerais
+fueran
+fuese
+fueses
+fuésemos
+fueseis
+fuesen
+sintiendo
+sentido
+sentida
+sentidos
+sentidas
+siente
+sentid
+tengo
+tienes
+tiene
+tenemos
+tenéis
+tienen
+tenga
+tengas
+tengamos
+tengáis
+tengan
+tendré
+tendrás
+tendrá
+tendremos
+tendréis
+tendrán
+tendría
+tendrías
+tendríamos
+tendríais
+tendrían
+tenía
+tenías
+teníamos
+teníais
+tenían
+tuve
+tuviste
+tuvo
+tuvimos
+tuvisteis
+tuvieron
+tuviera
+tuvieras
+tuviéramos
+tuvierais
+tuvieran
+tuviese
+tuvieses
+tuviésemos
+tuvieseis
+tuviesen
+teniendo
+tenido
+tenida
+tenidos
+tenidas
+tened
+och
+det
+att
+i
+en
+jag
+hon
+som
+han
+på
+den
+med
+var
+sig
+för
+så
+till
+är
+men
+ett
+om
+hade
+de
+av
+icke
+mig
+du
+henne
+då
+sin
+nu
+har
+inte
+hans
+honom
+skulle
+hennes
+där
+min
+man
+ej
+vid
+kunde
+något
+från
+ut
+när
+efter
+upp
+vi
+dem
+vara
+vad
+över
+än
+dig
+kan
+sina
+här
+ha
+mot
+alla
+under
+någon
+eller
+allt
+mycket
+sedan
+ju
+denna
+själv
+detta
+åt
+utan
+varit
+hur
+ingen
+mitt
+ni
+bli
+blev
+oss
+din
+dessa
+några
+deras
+blir
+mina
+samma
+vilken
+er
+sådan
+vår
+blivit
+dess
+inom
+mellan
+sådant
+varför
+varje
+vilka
+ditt
+vem
+vilket
+sitta
+sådana
+vart
+dina
+vars
+vårt
+våra
+ert
+era
+vilkas
+acaba
+ama
+aslında
+az
+bazı
+belki
+biri
+birkaç
+birşey
+biz
+bu
+çok
+çünkü
+da
+daha
+de
+defa
+diye
+eğer
+en
+gibi
+hem
+hep
+hepsi
+her
+hiç
+için
+ile
+ise
+kez
+ki
+kim
+mı
+mu
+mü
+nasıl
+ne
+neden
+nerde
+nerede
+nereye
+niçin
+niye
+o
+sanki
+şey
+siz
+şu
+tüm
+ve
+veya
+ya
+yani
diff --git a/src/shorttext/utils/stopwords.txt b/src/shorttext/utils/stopwords.txt
new file mode 100644
index 00000000..4b21343d
--- /dev/null
+++ b/src/shorttext/utils/stopwords.txt
@@ -0,0 +1,2781 @@
+og
+i
+jeg
+det
+at
+en
+den
+til
+er
+som
+på
+de
+med
+han
+af
+for
+ikke
+der
+var
+mig
+sig
+men
+et
+har
+om
+vi
+min
+havde
+ham
+hun
+nu
+over
+da
+fra
+du
+ud
+sin
+dem
+os
+op
+man
+hans
+hvor
+eller
+hvad
+skal
+selv
+her
+alle
+vil
+blev
+kunne
+ind
+når
+være
+dog
+noget
+ville
+jo
+deres
+efter
+ned
+skulle
+denne
+end
+dette
+mit
+også
+under
+have
+dig
+anden
+hende
+mine
+alt
+meget
+sit
+sine
+vor
+mod
+disse
+hvis
+din
+nogle
+hos
+blive
+mange
+ad
+bliver
+hendes
+været
+thi
+jer
+sådan
+de
+en
+van
+ik
+te
+dat
+die
+in
+een
+hij
+het
+niet
+zijn
+is
+was
+op
+aan
+met
+als
+voor
+had
+er
+maar
+om
+hem
+dan
+zou
+of
+wat
+mijn
+men
+dit
+zo
+door
+over
+ze
+zich
+bij
+ook
+tot
+je
+mij
+uit
+der
+daar
+haar
+naar
+heb
+hoe
+heeft
+hebben
+deze
+u
+want
+nog
+zal
+me
+zij
+nu
+ge
+geen
+omdat
+iets
+worden
+toch
+al
+waren
+veel
+meer
+doen
+toen
+moet
+ben
+zonder
+kan
+hun
+dus
+alles
+onder
+ja
+eens
+hier
+wie
+werd
+altijd
+doch
+wordt
+wezen
+kunnen
+ons
+zelf
+tegen
+na
+reeds
+wil
+kon
+niets
+uw
+iemand
+geweest
+andere
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+s
+t
+can
+will
+just
+don
+should
+now
+d
+ll
+m
+o
+re
+ve
+y
+ain
+aren
+couldn
+didn
+doesn
+hadn
+hasn
+haven
+isn
+ma
+mightn
+mustn
+needn
+shan
+shouldn
+wasn
+weren
+won
+wouldn
+olla
+olen
+olet
+on
+olemme
+olette
+ovat
+ole
+oli
+olisi
+olisit
+olisin
+olisimme
+olisitte
+olisivat
+olit
+olin
+olimme
+olitte
+olivat
+ollut
+olleet
+en
+et
+ei
+emme
+ette
+eivät
+minä
+minun
+minut
+minua
+minussa
+minusta
+minuun
+minulla
+minulta
+minulle
+sinä
+sinun
+sinut
+sinua
+sinussa
+sinusta
+sinuun
+sinulla
+sinulta
+sinulle
+hän
+hänen
+hänet
+häntä
+hänessä
+hänestä
+häneen
+hänellä
+häneltä
+hänelle
+me
+meidän
+meidät
+meitä
+meissä
+meistä
+meihin
+meillä
+meiltä
+meille
+te
+teidän
+teidät
+teitä
+teissä
+teistä
+teihin
+teillä
+teiltä
+teille
+he
+heidän
+heidät
+heitä
+heissä
+heistä
+heihin
+heillä
+heiltä
+heille
+tämä
+tämän
+tätä
+tässä
+tästä
+tähän
+tallä
+tältä
+tälle
+tänä
+täksi
+tuo
+tuon
+tuotä
+tuossa
+tuosta
+tuohon
+tuolla
+tuolta
+tuolle
+tuona
+tuoksi
+se
+sen
+sitä
+siinä
+siitä
+siihen
+sillä
+siltä
+sille
+sinä
+siksi
+nämä
+näiden
+näitä
+näissä
+näistä
+näihin
+näillä
+näiltä
+näille
+näinä
+näiksi
+nuo
+noiden
+noita
+noissa
+noista
+noihin
+noilla
+noilta
+noille
+noina
+noiksi
+ne
+niiden
+niitä
+niissä
+niistä
+niihin
+niillä
+niiltä
+niille
+niinä
+niiksi
+kuka
+kenen
+kenet
+ketä
+kenessä
+kenestä
+keneen
+kenellä
+keneltä
+kenelle
+kenenä
+keneksi
+ketkä
+keiden
+ketkä
+keitä
+keissä
+keistä
+keihin
+keillä
+keiltä
+keille
+keinä
+keiksi
+mikä
+minkä
+minkä
+mitä
+missä
+mistä
+mihin
+millä
+miltä
+mille
+minä
+miksi
+mitkä
+joka
+jonka
+jota
+jossa
+josta
+johon
+jolla
+jolta
+jolle
+jona
+joksi
+jotka
+joiden
+joita
+joissa
+joista
+joihin
+joilla
+joilta
+joille
+joina
+joiksi
+että
+ja
+jos
+koska
+kuin
+mutta
+niin
+sekä
+sillä
+tai
+vaan
+vai
+vaikka
+kanssa
+mukaan
+noin
+poikki
+yli
+kun
+niin
+nyt
+itse
+au
+aux
+avec
+ce
+ces
+dans
+de
+des
+du
+elle
+en
+et
+eux
+il
+je
+la
+le
+leur
+lui
+ma
+mais
+me
+même
+mes
+moi
+mon
+ne
+nos
+notre
+nous
+on
+ou
+par
+pas
+pour
+qu
+que
+qui
+sa
+se
+ses
+son
+sur
+ta
+te
+tes
+toi
+ton
+tu
+un
+une
+vos
+votre
+vous
+c
+d
+j
+l
+à
+m
+n
+s
+t
+y
+été
+étée
+étées
+étés
+étant
+étante
+étants
+étantes
+suis
+es
+est
+sommes
+êtes
+sont
+serai
+seras
+sera
+serons
+serez
+seront
+serais
+serait
+serions
+seriez
+seraient
+étais
+était
+étions
+étiez
+étaient
+fus
+fut
+fûmes
+fûtes
+furent
+sois
+soit
+soyons
+soyez
+soient
+fusse
+fusses
+fût
+fussions
+fussiez
+fussent
+ayant
+ayante
+ayantes
+ayants
+eu
+eue
+eues
+eus
+ai
+as
+avons
+avez
+ont
+aurai
+auras
+aura
+aurons
+aurez
+auront
+aurais
+aurait
+aurions
+auriez
+auraient
+avais
+avait
+avions
+aviez
+avaient
+eut
+eûmes
+eûtes
+eurent
+aie
+aies
+ait
+ayons
+ayez
+aient
+eusse
+eusses
+eût
+eussions
+eussiez
+eussent
+aber
+alle
+allem
+allen
+aller
+alles
+als
+also
+am
+an
+ander
+andere
+anderem
+anderen
+anderer
+anderes
+anderm
+andern
+anderr
+anders
+auch
+auf
+aus
+bei
+bin
+bis
+bist
+da
+damit
+dann
+der
+den
+des
+dem
+die
+das
+daß
+derselbe
+derselben
+denselben
+desselben
+demselben
+dieselbe
+dieselben
+dasselbe
+dazu
+dein
+deine
+deinem
+deinen
+deiner
+deines
+denn
+derer
+dessen
+dich
+dir
+du
+dies
+diese
+diesem
+diesen
+dieser
+dieses
+doch
+dort
+durch
+ein
+eine
+einem
+einen
+einer
+eines
+einig
+einige
+einigem
+einigen
+einiger
+einiges
+einmal
+er
+ihn
+ihm
+es
+etwas
+euer
+eure
+eurem
+euren
+eurer
+eures
+für
+gegen
+gewesen
+hab
+habe
+haben
+hat
+hatte
+hatten
+hier
+hin
+hinter
+ich
+mich
+mir
+ihr
+ihre
+ihrem
+ihren
+ihrer
+ihres
+euch
+im
+in
+indem
+ins
+ist
+jede
+jedem
+jeden
+jeder
+jedes
+jene
+jenem
+jenen
+jener
+jenes
+jetzt
+kann
+kein
+keine
+keinem
+keinen
+keiner
+keines
+können
+könnte
+machen
+man
+manche
+manchem
+manchen
+mancher
+manches
+mein
+meine
+meinem
+meinen
+meiner
+meines
+mit
+muss
+musste
+nach
+nicht
+nichts
+noch
+nun
+nur
+ob
+oder
+ohne
+sehr
+sein
+seine
+seinem
+seinen
+seiner
+seines
+selbst
+sich
+sie
+ihnen
+sind
+so
+solche
+solchem
+solchen
+solcher
+solches
+soll
+sollte
+sondern
+sonst
+über
+um
+und
+uns
+unsere
+unserem
+unseren
+unser
+unseres
+unter
+viel
+vom
+von
+vor
+während
+war
+waren
+warst
+was
+weg
+weil
+weiter
+welche
+welchem
+welchen
+welcher
+welches
+wenn
+werde
+werden
+wie
+wieder
+will
+wir
+wird
+wirst
+wo
+wollen
+wollte
+würde
+würden
+zu
+zum
+zur
+zwar
+zwischen
+a
+ahogy
+ahol
+aki
+akik
+akkor
+alatt
+által
+általában
+amely
+amelyek
+amelyekben
+amelyeket
+amelyet
+amelynek
+ami
+amit
+amolyan
+amíg
+amikor
+át
+abban
+ahhoz
+annak
+arra
+arról
+az
+azok
+azon
+azt
+azzal
+azért
+aztán
+azután
+azonban
+bár
+be
+belül
+benne
+cikk
+cikkek
+cikkeket
+csak
+de
+e
+eddig
+egész
+egy
+egyes
+egyetlen
+egyéb
+egyik
+egyre
+ekkor
+el
+elég
+ellen
+elõ
+elõször
+elõtt
+elsõ
+én
+éppen
+ebben
+ehhez
+emilyen
+ennek
+erre
+ez
+ezt
+ezek
+ezen
+ezzel
+ezért
+és
+fel
+felé
+hanem
+hiszen
+hogy
+hogyan
+igen
+így
+illetve
+ill.
+ill
+ilyen
+ilyenkor
+ison
+ismét
+itt
+jó
+jól
+jobban
+kell
+kellett
+keresztül
+keressünk
+ki
+kívül
+között
+közül
+legalább
+lehet
+lehetett
+legyen
+lenne
+lenni
+lesz
+lett
+maga
+magát
+majd
+majd
+már
+más
+másik
+meg
+még
+mellett
+mert
+mely
+melyek
+mi
+mit
+míg
+miért
+milyen
+mikor
+minden
+mindent
+mindenki
+mindig
+mint
+mintha
+mivel
+most
+nagy
+nagyobb
+nagyon
+ne
+néha
+nekem
+neki
+nem
+néhány
+nélkül
+nincs
+olyan
+ott
+össze
+õ
+õk
+õket
+pedig
+persze
+rá
+s
+saját
+sem
+semmi
+sok
+sokat
+sokkal
+számára
+szemben
+szerint
+szinte
+talán
+tehát
+teljes
+tovább
+továbbá
+több
+úgy
+ugyanis
+új
+újabb
+újra
+után
+utána
+utolsó
+vagy
+vagyis
+valaki
+valami
+valamint
+való
+vagyok
+van
+vannak
+volt
+voltam
+voltak
+voltunk
+vissza
+vele
+viszont
+volna
+ad
+al
+allo
+ai
+agli
+all
+agl
+alla
+alle
+con
+col
+coi
+da
+dal
+dallo
+dai
+dagli
+dall
+dagl
+dalla
+dalle
+di
+del
+dello
+dei
+degli
+dell
+degl
+della
+delle
+in
+nel
+nello
+nei
+negli
+nell
+negl
+nella
+nelle
+su
+sul
+sullo
+sui
+sugli
+sull
+sugl
+sulla
+sulle
+per
+tra
+contro
+io
+tu
+lui
+lei
+noi
+voi
+loro
+mio
+mia
+miei
+mie
+tuo
+tua
+tuoi
+tue
+suo
+sua
+suoi
+sue
+nostro
+nostra
+nostri
+nostre
+vostro
+vostra
+vostri
+vostre
+mi
+ti
+ci
+vi
+lo
+la
+li
+le
+gli
+ne
+il
+un
+uno
+una
+ma
+ed
+se
+perché
+anche
+come
+dov
+dove
+che
+chi
+cui
+non
+più
+quale
+quanto
+quanti
+quanta
+quante
+quello
+quelli
+quella
+quelle
+questo
+questi
+questa
+queste
+si
+tutto
+tutti
+a
+c
+e
+i
+l
+o
+ho
+hai
+ha
+abbiamo
+avete
+hanno
+abbia
+abbiate
+abbiano
+avrò
+avrai
+avrà
+avremo
+avrete
+avranno
+avrei
+avresti
+avrebbe
+avremmo
+avreste
+avrebbero
+avevo
+avevi
+aveva
+avevamo
+avevate
+avevano
+ebbi
+avesti
+ebbe
+avemmo
+aveste
+ebbero
+avessi
+avesse
+avessimo
+avessero
+avendo
+avuto
+avuta
+avuti
+avute
+sono
+sei
+è
+siamo
+siete
+sia
+siate
+siano
+sarò
+sarai
+sarà
+saremo
+sarete
+saranno
+sarei
+saresti
+sarebbe
+saremmo
+sareste
+sarebbero
+ero
+eri
+era
+eravamo
+eravate
+erano
+fui
+fosti
+fu
+fummo
+foste
+furono
+fossi
+fosse
+fossimo
+fossero
+essendo
+faccio
+fai
+facciamo
+fanno
+faccia
+facciate
+facciano
+farò
+farai
+farà
+faremo
+farete
+faranno
+farei
+faresti
+farebbe
+faremmo
+fareste
+farebbero
+facevo
+facevi
+faceva
+facevamo
+facevate
+facevano
+feci
+facesti
+fece
+facemmo
+faceste
+fecero
+facessi
+facesse
+facessimo
+facessero
+facendo
+sto
+stai
+sta
+stiamo
+stanno
+stia
+stiate
+stiano
+starò
+starai
+starà
+staremo
+starete
+staranno
+starei
+staresti
+starebbe
+staremmo
+stareste
+starebbero
+stavo
+stavi
+stava
+stavamo
+stavate
+stavano
+stetti
+stesti
+stette
+stemmo
+steste
+stettero
+stessi
+stesse
+stessimo
+stessero
+stando
+ах
+ох
+эх
+ай
+эй
+ой
+тағы
+тағыда
+әрине
+жоқ
+сондай
+осындай
+осылай
+солай
+мұндай
+бұндай
+мен
+сен
+ол
+біз
+біздер
+олар
+сіз
+сіздер
+маған
+оған
+саған
+біздің
+сіздің
+оның
+бізге
+сізге
+оларға
+біздерге
+сіздерге
+оларға
+менімен
+сенімен
+онымен
+бізбен
+сізбен
+олармен
+біздермен
+сіздермен
+менің
+сенің
+біздің
+сіздің
+оның
+біздердің
+сіздердің
+олардың
+маған
+саған
+оған
+менен
+сенен
+одан
+бізден
+сізден
+олардан
+біздерден
+сіздерден
+олардан
+айтпақшы
+сонымен
+сондықтан
+бұл
+осы
+сол
+анау
+мынау
+сонау
+осынау
+ана
+мына
+сона
+әні
+міне
+өй
+үйт
+бүйт
+біреу
+кейбіреу
+кейбір
+қайсыбір
+әрбір
+бірнеше
+бірдеме
+бірнеше
+әркім
+әрне
+әрқайсы
+әрқалай
+әлдекім
+әлдене
+әлдеқайдан
+әлденеше
+әлдеқалай
+әлдеқашан
+алдақашан
+еш
+ешкім
+ешбір
+ештеме
+дәнеңе
+ешқашан
+ешқандай
+ешқайсы
+емес
+бәрі
+барлық
+барша
+бар
+күллі
+бүкіл
+түгел
+өз
+өзім
+өзің
+өзінің
+өзіме
+өзіне
+өзімнің
+өзі
+өзге
+менде
+сенде
+онда
+менен
+сенен онан
+одан
+ау
+па
+ей
+әй
+е
+уа
+уау
+уай
+я
+пай
+ә
+о
+оһо
+ой
+ие
+аһа
+ау
+беу
+мәссаған
+бәрекелді
+әттегенай
+жаракімалла
+масқарай
+астапыралла
+япырмай
+ойпырмай
+кәне
+кәнеки
+ал
+әйда
+кәні
+міне
+әні
+сорап
+қош-қош
+пфша
+пішә
+құрау-құрау
+шәйт
+шек
+моһ
+тәк
+құрау
+құр
+кә
+кәһ
+күшім
+күшім
+мышы
+пырс
+әукім
+алақай
+паһ-паһ
+бәрекелді
+ура
+әттең
+әттеген-ай
+қап
+түге
+пішту
+шіркін
+алатау
+пай-пай
+үшін
+сайын
+сияқты
+туралы
+арқылы
+бойы
+бойымен
+шамалы
+шақты
+қаралы
+ғұрлы
+ғұрлым
+шейін
+дейін
+қарай
+таман
+салым
+тарта
+жуық
+таяу
+гөрі
+бері
+кейін
+соң
+бұрын
+бетер
+қатар
+бірге
+қоса
+арс
+гүрс
+дүрс
+қорс
+тарс
+тырс
+ырс
+барқ
+борт
+күрт
+кірт
+морт
+сарт
+шырт
+дүңк
+күңк
+қыңқ
+мыңқ
+маңқ
+саңқ
+шаңқ
+шіңк
+сыңқ
+таңқ
+тыңқ
+ыңқ
+болп
+былп
+жалп
+желп
+қолп
+ірк
+ырқ
+сарт-сұрт
+тарс-тұрс
+арс-ұрс
+жалт-жалт
+жалт-жұлт
+қалт-қалт
+қалт-құлт
+қаңқ-қаңқ
+қаңқ-құңқ
+шаңқ-шаңқ
+шаңқ-шұңқ
+арбаң-арбаң
+бүгжең-бүгжең
+арсалаң-арсалаң
+ербелең-ербелең
+батыр-бұтыр
+далаң-далаң
+тарбаң-тарбаң
+қызараң-қызараң
+қаңғыр-күңгір
+қайқаң-құйқаң
+митың-митың
+салаң-сұлаң
+ыржың-тыржың
+бірақ
+алайда
+дегенмен
+әйтпесе
+әйткенмен
+себебі
+өйткені
+сондықтан
+үшін
+сайын
+сияқты
+туралы
+арқылы
+бойы
+бойымен
+шамалы
+шақты
+қаралы
+ғұрлы
+ғұрлым
+гөрі
+бері
+кейін
+соң
+бұрын
+бетер
+қатар
+бірге
+қоса
+шейін
+дейін
+қарай
+таман
+салым
+тарта
+жуық
+таяу
+арнайы
+осындай
+ғана
+қана
+тек
+әншейін
+og
+i
+jeg
+det
+at
+en
+et
+den
+til
+er
+som
+på
+de
+med
+han
+av
+ikke
+ikkje
+der
+så
+var
+meg
+seg
+men
+ett
+har
+om
+vi
+min
+mitt
+ha
+hadde
+hun
+nå
+over
+da
+ved
+fra
+du
+ut
+sin
+dem
+oss
+opp
+man
+kan
+hans
+hvor
+eller
+hva
+skal
+selv
+sjøl
+her
+alle
+vil
+bli
+ble
+blei
+blitt
+kunne
+inn
+når
+være
+kom
+noen
+noe
+ville
+dere
+som
+deres
+kun
+ja
+etter
+ned
+skulle
+denne
+for
+deg
+si
+sine
+sitt
+mot
+å
+meget
+hvorfor
+dette
+disse
+uten
+hvordan
+ingen
+din
+ditt
+blir
+samme
+hvilken
+hvilke
+sånn
+inni
+mellom
+vår
+hver
+hvem
+vors
+hvis
+både
+bare
+enn
+fordi
+før
+mange
+også
+slik
+vært
+være
+båe
+begge
+siden
+dykk
+dykkar
+dei
+deira
+deires
+deim
+di
+då
+eg
+ein
+eit
+eitt
+elles
+honom
+hjå
+ho
+hoe
+henne
+hennar
+hennes
+hoss
+hossen
+ikkje
+ingi
+inkje
+korleis
+korso
+kva
+kvar
+kvarhelst
+kven
+kvi
+kvifor
+me
+medan
+mi
+mine
+mykje
+no
+nokon
+noka
+nokor
+noko
+nokre
+si
+sia
+sidan
+so
+somt
+somme
+um
+upp
+vere
+vore
+verte
+vort
+varte
+vart
+de
+a
+o
+que
+e
+do
+da
+em
+um
+para
+com
+não
+uma
+os
+no
+se
+na
+por
+mais
+as
+dos
+como
+mas
+ao
+ele
+das
+à
+seu
+sua
+ou
+quando
+muito
+nos
+já
+eu
+também
+só
+pelo
+pela
+até
+isso
+ela
+entre
+depois
+sem
+mesmo
+aos
+seus
+quem
+nas
+me
+esse
+eles
+você
+essa
+num
+nem
+suas
+meu
+às
+minha
+numa
+pelos
+elas
+qual
+nós
+lhe
+deles
+essas
+esses
+pelas
+este
+dele
+tu
+te
+vocês
+vos
+lhes
+meus
+minhas
+teu
+tua
+teus
+tuas
+nosso
+nossa
+nossos
+nossas
+dela
+delas
+esta
+estes
+estas
+aquele
+aquela
+aqueles
+aquelas
+isto
+aquilo
+estou
+está
+estamos
+estão
+estive
+esteve
+estivemos
+estiveram
+estava
+estávamos
+estavam
+estivera
+estivéramos
+esteja
+estejamos
+estejam
+estivesse
+estivéssemos
+estivessem
+estiver
+estivermos
+estiverem
+hei
+há
+havemos
+hão
+houve
+houvemos
+houveram
+houvera
+houvéramos
+haja
+hajamos
+hajam
+houvesse
+houvéssemos
+houvessem
+houver
+houvermos
+houverem
+houverei
+houverá
+houveremos
+houverão
+houveria
+houveríamos
+houveriam
+sou
+somos
+são
+era
+éramos
+eram
+fui
+foi
+fomos
+foram
+fora
+fôramos
+seja
+sejamos
+sejam
+fosse
+fôssemos
+fossem
+for
+formos
+forem
+serei
+será
+seremos
+serão
+seria
+seríamos
+seriam
+tenho
+tem
+temos
+tém
+tinha
+tínhamos
+tinham
+tive
+teve
+tivemos
+tiveram
+tivera
+tivéramos
+tenha
+tenhamos
+tenham
+tivesse
+tivéssemos
+tivessem
+tiver
+tivermos
+tiverem
+terei
+terá
+teremos
+terão
+teria
+teríamos
+teriam
+и
+в
+во
+не
+что
+он
+на
+я
+с
+со
+как
+а
+то
+все
+она
+так
+его
+но
+да
+ты
+к
+у
+же
+вы
+за
+бы
+по
+только
+ее
+мне
+было
+вот
+от
+меня
+еще
+нет
+о
+из
+ему
+теперь
+когда
+даже
+ну
+вдруг
+ли
+если
+уже
+или
+ни
+быть
+был
+него
+до
+вас
+нибудь
+опять
+уж
+вам
+ведь
+там
+потом
+себя
+ничего
+ей
+может
+они
+тут
+где
+есть
+надо
+ней
+для
+мы
+тебя
+их
+чем
+была
+сам
+чтоб
+без
+будто
+чего
+раз
+тоже
+себе
+под
+будет
+ж
+тогда
+кто
+этот
+того
+потому
+этого
+какой
+совсем
+ним
+здесь
+этом
+один
+почти
+мой
+тем
+чтобы
+нее
+сейчас
+были
+куда
+зачем
+всех
+никогда
+можно
+при
+наконец
+два
+об
+другой
+хоть
+после
+над
+больше
+тот
+через
+эти
+нас
+про
+всего
+них
+какая
+много
+разве
+три
+эту
+моя
+впрочем
+хорошо
+свою
+этой
+перед
+иногда
+лучше
+чуть
+том
+нельзя
+такой
+им
+более
+всегда
+конечно
+всю
+между
+de
+la
+que
+el
+en
+y
+a
+los
+del
+se
+las
+por
+un
+para
+con
+no
+una
+su
+al
+lo
+como
+más
+pero
+sus
+le
+ya
+o
+este
+sí
+porque
+esta
+entre
+cuando
+muy
+sin
+sobre
+también
+me
+hasta
+hay
+donde
+quien
+desde
+todo
+nos
+durante
+todos
+uno
+les
+ni
+contra
+otros
+ese
+eso
+ante
+ellos
+e
+esto
+mí
+antes
+algunos
+qué
+unos
+yo
+otro
+otras
+otra
+él
+tanto
+esa
+estos
+mucho
+quienes
+nada
+muchos
+cual
+poco
+ella
+estar
+estas
+algunas
+algo
+nosotros
+mi
+mis
+tú
+te
+ti
+tu
+tus
+ellas
+nosotras
+vosostros
+vosostras
+os
+mío
+mía
+míos
+mías
+tuyo
+tuya
+tuyos
+tuyas
+suyo
+suya
+suyos
+suyas
+nuestro
+nuestra
+nuestros
+nuestras
+vuestro
+vuestra
+vuestros
+vuestras
+esos
+esas
+estoy
+estás
+está
+estamos
+estáis
+están
+esté
+estés
+estemos
+estéis
+estén
+estaré
+estarás
+estará
+estaremos
+estaréis
+estarán
+estaría
+estarías
+estaríamos
+estaríais
+estarían
+estaba
+estabas
+estábamos
+estabais
+estaban
+estuve
+estuviste
+estuvo
+estuvimos
+estuvisteis
+estuvieron
+estuviera
+estuvieras
+estuviéramos
+estuvierais
+estuvieran
+estuviese
+estuvieses
+estuviésemos
+estuvieseis
+estuviesen
+estando
+estado
+estada
+estados
+estadas
+estad
+he
+has
+ha
+hemos
+habéis
+han
+haya
+hayas
+hayamos
+hayáis
+hayan
+habré
+habrás
+habrá
+habremos
+habréis
+habrán
+habría
+habrías
+habríamos
+habríais
+habrían
+había
+habías
+habíamos
+habíais
+habían
+hube
+hubiste
+hubo
+hubimos
+hubisteis
+hubieron
+hubiera
+hubieras
+hubiéramos
+hubierais
+hubieran
+hubiese
+hubieses
+hubiésemos
+hubieseis
+hubiesen
+habiendo
+habido
+habida
+habidos
+habidas
+soy
+eres
+es
+somos
+sois
+son
+sea
+seas
+seamos
+seáis
+sean
+seré
+serás
+será
+seremos
+seréis
+serán
+sería
+serías
+seríamos
+seríais
+serían
+era
+eras
+éramos
+erais
+eran
+fui
+fuiste
+fue
+fuimos
+fuisteis
+fueron
+fuera
+fueras
+fuéramos
+fuerais
+fueran
+fuese
+fueses
+fuésemos
+fueseis
+fuesen
+sintiendo
+sentido
+sentida
+sentidos
+sentidas
+siente
+sentid
+tengo
+tienes
+tiene
+tenemos
+tenéis
+tienen
+tenga
+tengas
+tengamos
+tengáis
+tengan
+tendré
+tendrás
+tendrá
+tendremos
+tendréis
+tendrán
+tendría
+tendrías
+tendríamos
+tendríais
+tendrían
+tenía
+tenías
+teníamos
+teníais
+tenían
+tuve
+tuviste
+tuvo
+tuvimos
+tuvisteis
+tuvieron
+tuviera
+tuvieras
+tuviéramos
+tuvierais
+tuvieran
+tuviese
+tuvieses
+tuviésemos
+tuvieseis
+tuviesen
+teniendo
+tenido
+tenida
+tenidos
+tenidas
+tened
+och
+det
+att
+i
+en
+jag
+hon
+som
+han
+på
+den
+med
+var
+sig
+för
+så
+till
+är
+men
+ett
+om
+hade
+de
+av
+icke
+mig
+du
+henne
+då
+sin
+nu
+har
+inte
+hans
+honom
+skulle
+hennes
+där
+min
+man
+ej
+vid
+kunde
+något
+från
+ut
+när
+efter
+upp
+vi
+dem
+vara
+vad
+över
+än
+dig
+kan
+sina
+här
+ha
+mot
+alla
+under
+någon
+eller
+allt
+mycket
+sedan
+ju
+denna
+själv
+detta
+åt
+utan
+varit
+hur
+ingen
+mitt
+ni
+bli
+blev
+oss
+din
+dessa
+några
+deras
+blir
+mina
+samma
+vilken
+er
+sådan
+vår
+blivit
+dess
+inom
+mellan
+sådant
+varför
+varje
+vilka
+ditt
+vem
+vilket
+sitta
+sådana
+vart
+dina
+vars
+vårt
+våra
+ert
+era
+vilkas
+acaba
+ama
+aslında
+az
+bazı
+belki
+biri
+birkaç
+birşey
+biz
+bu
+çok
+çünkü
+da
+daha
+de
+defa
+diye
+eğer
+en
+gibi
+hem
+hep
+hepsi
+her
+hiç
+için
+ile
+ise
+kez
+ki
+kim
+mı
+mu
+mü
+nasıl
+ne
+neden
+nerde
+nerede
+nereye
+niçin
+niye
+o
+sanki
+şey
+siz
+şu
+tüm
+ve
+veya
+ya
+yani
diff --git a/src/shorttext/utils/textpreprocessing.py b/src/shorttext/utils/textpreprocessing.py
new file mode 100644
index 00000000..910ac0ef
--- /dev/null
+++ b/src/shorttext/utils/textpreprocessing.py
@@ -0,0 +1,185 @@
+
+import re
+import os
+import codecs
+from io import TextIOWrapper
+from types import FunctionType
+from functools import partial
+
+import snowballstemmer
+
+
+# tokenizer
+def tokenize(s: str) -> list[str]:
+ return s.split(' ')
+
+
+# stemmer
+class StemmerSingleton:
+ def __new__(cls):
+ if not hasattr(cls, 'instance'):
+ cls.instance = super(StemmerSingleton, cls).__new__(cls)
+ cls.stemmer = snowballstemmer.stemmer('english')
+ return cls.instance
+
+ def __call__(self, s: str) -> str:
+ return self.stemmer.stemWord(s)
+
+def stemword(s: str) -> str:
+ return StemmerSingleton()(s)
+
+
+def preprocess_text(text: str, pipeline: list[FunctionType]) -> str:
+ """ Preprocess the text according to the given pipeline.
+
+ Given the pipeline, which is a list of functions that each transform an
+ input text into another text (e.g., stemming, lemmatizing, removing punctuation),
+ preprocess the text.
+
+ :param text: text to be preprocessed
+ :param pipeline: a list of functions that convert a text to another text
+ :return: preprocessed text
+ :type text: str
+ :type pipeline: list
+ :rtype: str
+ """
+ return text if len(pipeline)==0 else preprocess_text(pipeline[0](text), pipeline[1:])
+
+
+def tokenize_text(
+ text: str,
+ presplit_pipeline: list[FunctionType],
+ tokenizer: FunctionType,
+ postsplit_pipeline: list[FunctionType],
+ stopwordsfile: TextIOWrapper
+) -> list[str]:
+ # load stop words file
+ stopwordset = set([stopword.strip() for stopword in stopwordsfile])
+
+ # run the pre-split pipeline, tokenize, then apply the post-split pipeline token by token
+ presplit_text = text
+ for func in presplit_pipeline:
+ presplit_text = func(presplit_text)
+ postsplit_tokens = tokenizer(presplit_text)
+ for func in postsplit_pipeline:
+ for i, token in enumerate(postsplit_tokens):
+ postsplit_tokens[i] = func(token)
+ postsplit_tokens = [
+ token for token in postsplit_tokens
+ if token not in stopwordset
+ ]
+ return postsplit_tokens
+
+
+def text_preprocessor(pipeline: list[FunctionType]) -> FunctionType:
+ """ Return the function that preprocesses text according to the pipeline.
+
+ Given the pipeline, which is a list of functions that each transform an
+ input text into another text (e.g., stemming, lemmatizing, removing punctuation),
+ return a function that preprocesses an input text as outlined by the pipeline, essentially
+ a function that runs :func:`~preprocess_text` with the specified pipeline.
+
+ :param pipeline: a list of functions that convert a text to another text
+ :return: a function that preprocesses text according to the pipeline
+ :type pipeline: list
+ :rtype: function
+ """
+ return partial(preprocess_text, pipeline=pipeline)
+
+
+def oldschool_standard_text_preprocessor(stopwordsfile: TextIOWrapper) -> FunctionType:
+ """ Return a commonly used text preprocessor.
+
+ Return a text preprocessor that is commonly used, with the following steps:
+
+ - removing special characters,
+ - removing numerals,
+ - converting all letters to lower case,
+ - removing stop words, and
+ - stemming the words (using Porter stemmer).
+
+ This function calls :func:`~text_preprocessor`.
+
+ :param stopwordsfile: file object of the list of stop words
+ :type stopwordsfile: file
+ :return: a function that preprocesses text according to the pipeline
+ :rtype: function
+ """
+ # load stop words file
+ stopwordset = set([stopword.strip() for stopword in stopwordsfile])
+ stopwordsfile.close()
+
+ # the pipeline
+ pipeline = [lambda s: re.sub(r'[^\w\s]', '', s),
+ lambda s: re.sub(r'[\d]', '', s),
+ lambda s: s.lower(),
+ lambda s: ' '.join(filter(lambda token: token not in stopwordset, tokenize(s))),
+ lambda s: ' '.join([stemword(token) for token in tokenize(s)])
+ ]
+ return text_preprocessor(pipeline)
+
+
+def standard_text_preprocessor_1() -> FunctionType:
+ """ Return a commonly used text preprocessor.
+
+ Return a text preprocessor that is commonly used, with the following steps:
+
+ - removing special characters,
+ - removing numerals,
+ - converting all letters to lower case,
+ - removing stop words (NLTK list), and
+ - stemming the words (using Porter stemmer).
+
+ This function calls :func:`~oldschool_standard_text_preprocessor`.
+
+ :return: a function that preprocesses text according to the pipeline
+ :rtype: function
+ """
+ # load stop words
+ this_dir, _ = os.path.split(__file__)
+ stopwordsfile = codecs.open(os.path.join(this_dir, 'stopwords.txt'), 'r', 'utf-8')
+
+ return oldschool_standard_text_preprocessor(stopwordsfile)
+
+
+def standard_text_preprocessor_2() -> FunctionType:
+ """ Return a commonly used text preprocessor.
+
+ Return a text preprocessor that is commonly used, with the following steps:
+
+ - removing special characters,
+ - removing numerals,
+ - converting all letters to lower case,
+ - removing stop words (NLTK list minus negation terms), and
+ - stemming the words (using Porter stemmer).
+
+ This function calls :func:`~oldschool_standard_text_preprocessor`.
+
+ :return: a function that preprocesses text according to the pipeline
+ :rtype: function
+ """
+ # load stop words
+ this_dir, _ = os.path.split(__file__)
+ stopwordsfile = codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8')
+
+ return oldschool_standard_text_preprocessor(stopwordsfile)
+
+
+def advanced_text_tokenizer_1() -> FunctionType:
+ presplit_pipeline = [
+ lambda s: re.sub(r'[^\w\s]', '', s),
+ lambda s: re.sub(r'[\d]', '', s),
+ lambda s: s.lower()
+ ]
+ tokenizer = tokenize
+ postsplit_pipeline = [
+ lambda s: ' '.join([stemword(token) for token in tokenize(s)])
+ ]
+ this_dir, _ = os.path.split(__file__)
+ return partial(
+ tokenize_text,
+ presplit_pipeline=presplit_pipeline,
+ tokenizer=tokenizer,
+ postsplit_pipeline=postsplit_pipeline,
+ stopwordsfile=codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8')
+ )
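A quick sketch of how the preprocessing utilities above compose (a minimal example, assuming the package is importable as `shorttext` and that `text_preprocessor`, `standard_text_preprocessor_1`, `tokenize`, and `stemword` are re-exported from `shorttext.utils`, as the unit tests later in this diff assume; the sample sentences and the custom pipeline are illustrative only):

    import re
    from shorttext.utils import standard_text_preprocessor_1, text_preprocessor, tokenize, stemword

    # standard pipeline: strip punctuation and digits, lower-case, remove stop words, stem
    preprocess = standard_text_preprocessor_1()
    print(preprocess('Natural language processing and text mining on fire.'))
    # expected (see test_textpreprocessing.py): 'natur languag process text mine fire'

    # a custom pipeline built from the same primitives
    custom = text_preprocessor([
        lambda s: re.sub(r'[\d]', '', s),                               # drop numerals
        lambda s: s.lower(),                                            # lower-case
        lambda s: ' '.join(stemword(token) for token in tokenize(s)),   # stem each token
    ])
    print(custom('Learning 2 new languages'))                           # illustrative input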
diff --git a/src/shorttext/utils/wordembed.py b/src/shorttext/utils/wordembed.py
new file mode 100644
index 00000000..7e153431
--- /dev/null
+++ b/src/shorttext/utils/wordembed.py
@@ -0,0 +1,222 @@
+
+import numpy as np
+import gensim
+from gensim.models import KeyedVectors
+from gensim.models.keyedvectors import KeyedVectors
+from gensim.models.poincare import PoincareModel, PoincareKeyedVectors
+import requests
+
+from .textpreprocessing import tokenize
+
+
+def load_word2vec_model(path, binary=True):
+ """ Load a pre-trained Word2Vec model.
+
+ :param path: path of the file of the pre-trained Word2Vec model
+ :param binary: whether the file is in binary format (Default: True)
+ :return: a pre-trained Word2Vec model
+ :type path: str
+ :type binary: bool
+ :rtype: gensim.models.keyedvectors.KeyedVectors
+ """
+ return KeyedVectors.load_word2vec_format(path, binary=binary)
+
+
+def load_fasttext_model(path, encoding='utf-8'):
+ """ Load a pre-trained FastText model.
+
+ :param path: path of the file of the pre-trained FastText model
+ :return: a pre-trained FastText model
+ :type path: str
+ :rtype: gensim.models.keyedvectors.FastTextKeyedVectors
+ """
+ return gensim.models.fasttext.load_facebook_vectors(path, encoding=encoding)
+
+
+def load_poincare_model(path, word2vec_format=True, binary=False):
+ """ Load a Poincare embedding model.
+
+ :param path: path of the file of the pre-trained Poincare embedding model
+ :param word2vec_format: whether to load from word2vec format (default: True)
+ :param binary: binary format (default: False)
+ :return: a pre-trained Poincare embedding model
+ :type path: str
+ :type word2vec_format: bool
+ :type binary: bool
+ :rtype: gensim.models.poincare.PoincareKeyedVectors
+ """
+ if word2vec_format:
+ return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
+ else:
+ return PoincareModel.load(path).kv
+
+
+def shorttext_to_avgvec(shorttext, wvmodel):
+ """ Convert the short text into an averaged embedded vector representation.
+
+ Given a short sentence, it converts all the tokens into embedded vectors according to
+ the given word-embedding model, sums
+ them up, and normalizes the resulting vector. It returns the resulting vector
+ that represents this short sentence.
+
+ :param shorttext: a short sentence
+ :param wvmodel: word-embedding model
+ :return: an embedded vector that represents the short sentence
+ :type shorttext: str
+ :type wvmodel: gensim.models.keyedvectors.KeyedVectors
+ :rtype: numpy.ndarray
+ """
+ vec = np.sum(
+ [
+ wvmodel[token]
+ if token in wvmodel
+ else np.array([1.]*wvmodel.vector_size) / np.sqrt(wvmodel.vector_size)
+ for token in tokenize(shorttext)
+ ],
+ axis=0
+ )
+
+ # normalize
+ norm = np.linalg.norm(vec)
+ if norm != 0:
+ vec /= norm
+
+ return vec
+
+
+class RESTfulKeyedVectors(KeyedVectors):
+ """ RESTfulKeyedVectors, for connecting to the API of the preloaded word-embedding vectors loaded
+ by `WordEmbedAPI`.
+
+ This class inherits from :class:`gensim.models.keyedvectors.KeyedVectors`.
+
+ """
+ def __init__(self, url, port='5000'):
+ """ Initialize the class.
+
+ :param url: URL of the API, usually `http://localhost`
+ :param port: Port number
+ :type url: str
+ :type port: str
+ """
+ self.url = url
+ self.port = port
+
+ def closer_than(self, entity1, entity2):
+ """
+
+ :param entity1: word 1
+ :param entity2: word 2
+ :type entity1: str
+ :type entity2: str
+ :return: list of words
+ :rtype: list
+ """
+ r = requests.post(self.url + ':' + self.port + '/closerthan',
+ json={'entity1': entity1, 'entity2': entity2})
+ return r.json()
+
+ def distance(self, entity1, entity2):
+ """
+
+ :param entity1: word 1
+ :param entity2: word 2
+ :type entity1: str
+ :type entity2: str
+ :return: distance between two words
+ :rtype: float
+ """
+ r = requests.post(self.url + ':' + self.port + '/distance',
+ json={'entity1': entity1, 'entity2': entity2})
+ return r.json()['distance']
+
+ def distances(self, entity1, other_entities=()):
+ """
+
+ :param entity1: word
+ :param other_entities: list of words
+ :type entity1: str
+ :type other_entities: list
+ :return: list of distances between `entity1` and each word in `other_entities`
+ :rtype: list
+ """
+ r = requests.post(self.url + ':' + self.port + '/distances',
+ json={'entity1': entity1, 'other_entities': other_entities})
+ return np.array(r.json()['distances'], dtype=np.float32)
+
+ def get_vector(self, entity):
+ """
+
+ :param entity: word
+ :type entity: str
+ :return: word vector of the given word
+ :rtype: numpy.ndarray
+ """
+ r = requests.post(self.url + ':' + self.port + '/get_vector', json={'token': entity})
+ returned_dict = r.json()
+ if 'vector' in returned_dict:
+ return np.array(returned_dict['vector'])
+ else:
+ raise KeyError('The token {} does not exist in the model.'.format(entity))
+
+ def most_similar(self, **kwargs):
+ """
+
+ :param kwargs: keyword arguments accepted by `gensim.models.keyedvectors.KeyedVectors.most_similar`
+ :return: list of (word, similarity) pairs
+ """
+ r = requests.post(self.url + ':' + self.port + '/most_similar', json=kwargs)
+ return [tuple(pair) for pair in r.json()]
+
+ def most_similar_to_given(self, entity1, entities_list):
+ """
+
+ :param entity1: word
+ :param entities_list: list of words
+ :type entity1: str
+ :type entities_list: list
+ :return: list of similarities between the given word and each word in `entities_list`
+ :rtype: list
+ """
+ r = requests.post(self.url + ':' + self.port + '/most_similar_to_given',
+ json={'entity1': entity1, 'entities_list': entities_list})
+ return r.json()['token']
+
+ def rank(self, entity1, entity2):
+ """
+
+ :param entity1: word 1
+ :param entity2: word 2
+ :type entity1: str
+ :type entity2: str
+ :return: rank
+ :rtype: int
+ """
+ r = requests.post(self.url + ':' + self.port + '/rank',
+ json={'entity1': entity1, 'entity2': entity2})
+ return r.json()['rank']
+
+ def save(self, fname_or_handle, **kwargs):
+ """
+
+ :param fname_or_handle: filename or file handle (not supported by this class)
+ :param kwargs: other keyword arguments
+ :return: this method always raises `IOError`
+ """
+ raise IOError('The class RESTfulKeyedVectors does not persist models to a file.')
+
+ def similarity(self, entity1, entity2):
+ """
+
+ :param entity1: word 1
+ :param entity2: word 2
+ :return: similarity between two words
+ :type entity1: str
+ :type entity2: str
+ :rtype: float
+ """
+ r = requests.post(self.url + ':' + self.port + '/similarity',
+ json={'entity1': entity1, 'entity2': entity2})
+ return r.json()['similarity']
+
+# reference: https://radimrehurek.com/gensim/models/keyedvectors.html
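A minimal sketch of the word-embedding helpers above (the model path below is hypothetical; any word2vec binary readable by gensim's `KeyedVectors.load_word2vec_format` would do, and `shorttext_to_avgvec` is imported directly from the `shorttext.utils.wordembed` module defined in this diff):

    from shorttext.utils import load_word2vec_model
    from shorttext.utils.wordembed import shorttext_to_avgvec

    # hypothetical path to a pre-trained word2vec binary
    w2v_model = load_word2vec_model('/path/to/w2v_model.bin', binary=True)

    # sum (then L2-normalize) the embeddings of the tokens in a short sentence
    vec = shorttext_to_avgvec('artificial intelligence', w2v_model)
    print(vec.shape)   # (w2v_model.vector_size,)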
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 00000000..828a6894
--- /dev/null
+++ b/test/__init__.py
@@ -0,0 +1,3 @@
+"""
+This package has automated unit-tests for shorttext.
+"""
diff --git a/test/test_charonehot.py b/test/test_charonehot.py
new file mode 100644
index 00000000..c5f89aa4
--- /dev/null
+++ b/test/test_charonehot.py
@@ -0,0 +1,19 @@
+
+import unittest
+from urllib.request import urlopen
+
+import shorttext
+
+
+class TestCharOneHot(unittest.TestCase):
+ def test_BigTxt(self):
+ chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(
+ urlopen('http://norvig.com/big.txt'),
+ encoding='utf-8'
+ )
+ self.assertEqual(93, len(chartovec_encoder.dictionary))
+ self.assertEqual('\n', chartovec_encoder.signalchar)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_dtm.py b/test/test_dtm.py
new file mode 100644
index 00000000..0da2780f
--- /dev/null
+++ b/test/test_dtm.py
@@ -0,0 +1,46 @@
+
+import unittest
+import re
+
+import pandas as pd
+import shorttext
+from shorttext.utils import stemword, tokenize
+
+
+class TestDTM(unittest.TestCase):
+ def test_inaugural(self):
+ # preparing data
+ usprez = shorttext.data.inaugural()
+ docids = sorted(usprez.keys())
+ usprez = [' '.join(usprez[docid]) for docid in docids]
+ usprezdf = pd.DataFrame({'yrprez': docids, 'speech': usprez})
+ usprezdf = usprezdf[['yrprez', 'speech']]
+
+ # define the preprocessor
+ pipeline = [lambda s: re.sub(r'[^\w\s]', '', s),
+ lambda s: re.sub(r'[\d]', '', s),
+ lambda s: s.lower(),
+ lambda s: ' '.join([stemword(token) for token in tokenize(s)])
+ ]
+ txtpreprocessor = shorttext.utils.text_preprocessor(pipeline)
+
+ # corpus making
+ docids = list(usprezdf['yrprez'])
+ corpus = [txtpreprocessor(speech).split(' ') for speech in usprezdf['speech']]
+
+ # making DTM
+ dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True)
+
+ # check results
+ self.assertEqual(len(dtm.dictionary), 5256)
+ self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'], 0.0138,
+ places=3)
+ numdocs, numtokens = dtm.dtm.shape
+ self.assertEqual(numdocs, 56)
+ self.assertEqual(numtokens, 5256)
+ self.assertAlmostEqual(dtm.get_total_termfreq('government'), 0.27865372986738407,
+ places=3)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_fuzzylogic.py b/test/test_fuzzylogic.py
new file mode 100644
index 00000000..450ea1b6
--- /dev/null
+++ b/test/test_fuzzylogic.py
@@ -0,0 +1,34 @@
+
+import unittest
+
+import shorttext
+
+
+class TestFuzzyLogic(unittest.TestCase):
+ def test_similarity(self):
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('debug', 'deubg'), 1)
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('intrdependence', 'interdpeendencae'), 3)
+ self.assertEqual(shorttext.metrics.dynprog.lcp.longest_common_prefix('debug', 'debuag'), 4)
+
+ def test_transposition(self):
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('independent', 'indeepndent'), 1)
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('providence', 'porvidecne'), 2)
+
+ def test_insertion(self):
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algorithms'), 1)
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algoarithmm'), 2)
+
+ def test_deletion(self):
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algoithm'), 1)
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algorith'), 1)
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algrihm'), 2)
+
+ def test_correct(self):
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('python', 'python'), 0)
+ self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('sosad', 'sosad'), 0)
+
+ def test_jaccard(self):
+ self.assertAlmostEqual(shorttext.metrics.dynprog.jaccard.similarity('diver', 'driver'), 5./6.)
+
+if __name__ == '__main__':
+ unittest.main()
\ No newline at end of file
diff --git a/test/test_norvigspell.py b/test/test_norvigspell.py
new file mode 100644
index 00000000..b682be40
--- /dev/null
+++ b/test/test_norvigspell.py
@@ -0,0 +1,21 @@
+
+import unittest
+from urllib.request import urlopen
+
+import shorttext
+
+
+class TestSpellCheck(unittest.TestCase):
+ def setUp(self):
+ self.text = urlopen('http://norvig.com/big.txt').read()
+ self.text = self.text.decode('utf-8')
+
+ def test_norvig(self):
+ speller = shorttext.spell.NorvigSpellCorrector()
+ speller.train(self.text)
+ self.assertEqual(speller.correct('apple'), 'apple')
+ self.assertEqual(speller.correct('appl'), 'apply')
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_stacking.py b/test/test_stacking.py
new file mode 100644
index 00000000..96872e2b
--- /dev/null
+++ b/test/test_stacking.py
@@ -0,0 +1,131 @@
+
+import unittest
+import os
+
+import shorttext
+from shorttext.stack import LogisticStackedGeneralization
+from shorttext.smartload import smartload_compact_model
+from sklearn.svm import SVC
+
+
+class TestStacking(unittest.TestCase):
+ def setUp(self):
+ self.nihdict = shorttext.data.nihreports(sample_size=None)
+
+ def tearDown(self):
+ for filepath in os.listdir('.'):
+ if filepath.endswith('.bin'):
+ os.remove(os.path.join('.', filepath))
+
+ def training_stacking(self):
+ # loading NIH Reports
+ nihdict = {'NCCAM': self.nihdict['NCCAM'], 'NCATS': self.nihdict['NCATS']}
+
+ # maxent
+ maxent_classifier = shorttext.classifiers.MaxEntClassifier()
+ maxent_classifier.train(nihdict, nb_epochs=100)
+ maxent_classifier.save_compact_model('./bio_maxent.bin')
+
+ # SVM + LDA
+ topicmodeler = shorttext.generators.LDAModeler()
+ topicmodeler.train(nihdict, 8)
+ topicdisclassifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler)
+ topicmodeler.save_compact_model('./bio_lda.bin')
+ svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(topicmodeler, SVC())
+ svm_classifier.train(nihdict)
+ svm_classifier.save_compact_model('./bio_svm.bin')
+
+ # logistic
+ stacked_classifier = LogisticStackedGeneralization({'maxent': maxent_classifier,
+ 'svm': svm_classifier,
+ 'topiccosine': topicdisclassifier})
+ stacked_classifier.train(nihdict)
+ stacked_classifier.save_compact_model('./bio_logistics.bin')
+
+ return maxent_classifier, topicmodeler, svm_classifier, stacked_classifier
+
+ def comparedict(self, dict1, dict2):
+ self.assertTrue(len(dict1)==len(dict2))
+ print(dict1, dict2)
+ for classlabel in dict1:
+ self.assertTrue(classlabel in dict2)
+ self.assertAlmostEqual(dict1[classlabel], dict2[classlabel], places=4)
+
+ def testStudies(self):
+ # train
+ maxent_classifier, topicmodeler, svm_classifier, stacked_classifier = self.training_stacking()
+ topicdisclassifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler)
+
+ # smartload
+ maxent_classifier2 = smartload_compact_model('./bio_maxent.bin', None)
+ topicmodeler2 = smartload_compact_model('./bio_lda.bin', None)
+ topicdisclassifier2 = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler2)
+ svm_classifier2 = smartload_compact_model('./bio_svm.bin', None)
+ stacked_classifier2 = LogisticStackedGeneralization({'maxent': maxent_classifier2,
+ 'svm': svm_classifier2,
+ 'topiccosine': topicdisclassifier2})
+ stacked_classifier2.load_compact_model('./bio_logistics.bin')
+
+ # compare
+ terms = ['stem cell', 'grant', 'system biology']
+ for term in terms:
+ print(term)
+ print('maximum entropy')
+ self.comparedict(maxent_classifier.score(term), maxent_classifier2.score(term))
+ print('LDA')
+ self.comparedict(topicdisclassifier.score(term), topicdisclassifier2.score(term))
+ print('SVM')
+ self.comparedict(svm_classifier.score(term), svm_classifier2.score(term))
+ print('combined')
+ self.comparedict(stacked_classifier.score(term), stacked_classifier2.score(term))
+
+ def testSVM(self):
+ # loading NIH Reports
+ nihdict = {'NCCAM': self.nihdict['NCCAM'], 'NCATS': self.nihdict['NCATS']}
+
+ # svm
+ topicmodeler = shorttext.generators.LDAModeler()
+ topicmodeler.train(nihdict, 16)
+ svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(topicmodeler, SVC())
+ svm_classifier.train(nihdict)
+ print('before saving...')
+ print('--'.join(svm_classifier.classlabels))
+ print('--'.join(svm_classifier.topicmodeler.classlabels))
+ svm_classifier.save_compact_model('./bio_svm2.bin')
+ print('after saving...')
+ print('--'.join(svm_classifier.classlabels))
+ print('--'.join(svm_classifier.topicmodeler.classlabels))
+
+ # load
+ svm_classifier2 = smartload_compact_model('./bio_svm2.bin', None)
+ print('second classifier...')
+ print(','.join(svm_classifier2.classlabels))
+ print(','.join(svm_classifier2.topicmodeler.classlabels))
+
+ # compare
+ terms = ['stem cell', 'grant', 'system biology']
+ for term in terms:
+ print(term)
+ topicvec = svm_classifier.getvector(term)
+ topicvec2 = svm_classifier2.getvector(term)
+ print(topicvec)
+ print(topicvec2)
+ for idx, classlabel in enumerate(svm_classifier.classlabels):
+ print(str(idx)+' '+classlabel)
+ print(svm_classifier.classifier.score([topicvec], [idx]))
+ for idx, classlabel in enumerate(svm_classifier2.classlabels):
+ print(str(idx) + ' ' + classlabel)
+ print(svm_classifier2.classifier.score([topicvec2], [idx]))
+ print({classlabel: svm_classifier.classifier.score([topicvec], [idx])
+ for idx, classlabel in enumerate(svm_classifier.classlabels)})
+ print({classlabel: svm_classifier2.classifier.score([topicvec], [idx])
+ for idx, classlabel in enumerate(svm_classifier2.classlabels)})
+
+ for term in terms:
+ print(term)
+ self.comparedict(svm_classifier.score(term), svm_classifier2.score(term))
+
+
+if __name__ == '__main__':
+ unittest.main()
+
diff --git a/test/test_textpreprocessing.py b/test/test_textpreprocessing.py
new file mode 100644
index 00000000..4c836a05
--- /dev/null
+++ b/test/test_textpreprocessing.py
@@ -0,0 +1,21 @@
+
+import unittest
+
+import shorttext
+
+class TestTextPreprocessing(unittest.TestCase):
+ def testStandardPipeline(self):
+ preprocessor = shorttext.utils.standard_text_preprocessor_1()
+ self.assertEqual(preprocessor('I love you.'), 'love')
+ self.assertEqual(preprocessor('Natural language processing and text mining on fire.'), 'natur languag process text mine fire')
+ self.assertEqual(preprocessor('I do not think.'), 'think')
+
+ def testStandPipelineDifferentStopwords(self):
+ preprocessor = shorttext.utils.standard_text_preprocessor_2()
+ self.assertEqual(preprocessor('I love you.'), 'love')
+ self.assertEqual(preprocessor('Natural language processing and text mining on fire.'), 'natur languag process text mine fire')
+ self.assertEqual(preprocessor('I do not think.'), 'not think')
+
+
+if __name__ == '__main__':
+ unittest.main()
\ No newline at end of file
diff --git a/test/test_var_nn_embedded_vec_classifier.py b/test/test_var_nn_embedded_vec_classifier.py
new file mode 100644
index 00000000..3f88994d
--- /dev/null
+++ b/test/test_var_nn_embedded_vec_classifier.py
@@ -0,0 +1,104 @@
+
+import os
+import unittest
+import urllib
+
+import shorttext
+
+
+class TestVarNNEmbeddedVecClassifier(unittest.TestCase):
+ def setUp(self):
+ print("Downloading word-embedding model....")
+ link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin"
+ filename = "test_w2v_model.bin"
+ if not os.path.isfile("test_w2v_model.bin"):
+ urllib.request.urlretrieve(link, filename)
+ self.w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True) # load word2vec model
+ self.trainclass_dict = shorttext.data.subjectkeywords() # load training data
+
+ def tearDown(self):
+ print("Removing word-embedding model")
+ if os.path.isfile("test_w2v_model.bin"):
+ os.remove('test_w2v_model.bin')
+
+ def comparedict(self, dict1, dict2):
+ self.assertTrue(len(dict1)==len(dict2))
+ print(dict1, dict2)
+ for classlabel in dict1:
+ self.assertTrue(classlabel in dict2)
+ self.assertAlmostEqual(dict1[classlabel], dict2[classlabel], places=4)
+
+ def testCNNWordEmbedWithoutGensim(self):
+ print("Testing CNN...")
+ # create keras model using `CNNWordEmbed` class
+ print("\tKeras model")
+ keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model,
+ nb_labels=len(self.trainclass_dict.keys()))
+
+ # create and train classifier using keras model constructed above
+ print("\tTraining")
+ main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model)
+ main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
+
+ # compute classification score
+ print("\tTesting")
+ score_vals = main_classifier.score('artificial intelligence')
+ self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1)
+
+ def testDoubleCNNWordEmbedWithoutGensim(self):
+ print("Testing DoubleCNN...")
+ # create keras model using `DoubleCNNWordEmbed` class
+ print("\tKeras model")
+ keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model,
+ nb_labels=len(self.trainclass_dict.keys()))
+
+ # create and train classifier using keras model constructed above
+ print("\tTraining")
+ main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model)
+ main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
+
+ # compute classification score
+ print("\tTesting")
+ score_vals = main_classifier.score('artificial intelligence')
+ self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1)
+
+ def testCLSTMWordEmbedWithoutGensim(self):
+ print("Testing CLSTM...")
+ # create keras model using `CLSTMWordEmbed` class
+ print("\tKeras model")
+ keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model,
+ nb_labels=len(self.trainclass_dict.keys()))
+
+ # create and train classifier using keras model constructed above
+ print("\tTraining")
+ main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model)
+ main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
+
+ # compute classification score
+ print("\tTesting")
+ score_vals = main_classifier.score('artificial intelligence')
+ self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1)
+
+ def testAASumEmbed(self):
+ print("Testing SumEmbed")
+ classifier = shorttext.classifiers.SumEmbeddedVecClassifier(self.w2v_model)
+ classdict = shorttext.data.subjectkeywords()
+ classifier.train(classdict)
+
+ # compute
+ self.comparedict(classifier.score('linear algebra'),
+ {'mathematics': 0.9044698253778962,
+ 'physics': 0.7586816549044926,
+ 'theology': 0.1817602793151848})
+ self.comparedict(classifier.score('learning'),
+ {'mathematics': 0.9037142562255835,
+ 'physics': 0.7588376500004107,
+ 'theology': 0.18039468994239538})
+ self.comparedict(classifier.score('eschatology'),
+ {'mathematics': 0.3658578123294476,
+ 'physics': 0.5996711864493821,
+ 'theology': 0.9694560847986978})
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_wmd.py b/test/test_wmd.py
new file mode 100644
index 00000000..1cdd6f78
--- /dev/null
+++ b/test/test_wmd.py
@@ -0,0 +1,40 @@
+import os
+import unittest
+import urllib
+
+from shorttext.metrics.wasserstein import word_mover_distance
+from shorttext.utils import load_word2vec_model
+
+
+class TestWMD(unittest.TestCase):
+ def setUp(self):
+ print("Downloading word-embedding model....")
+ link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin"
+ filename = "test_w2v_model.bin"
+ if not os.path.isfile("test_w2v_model.bin"):
+ urllib.request.urlretrieve(link, filename)
+ self.w2v_model = load_word2vec_model(filename, binary=True) # load word2vec model
+
+ def tearDown(self):
+ print("Removing word-embedding model")
+ if os.path.isfile("test_w2v_model.bin"):
+ os.remove('test_w2v_model.bin')
+
+ def calculate_wmd(self, tokens1, tokens2, answer):
+ wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model)
+ self.assertAlmostEqual(wdistance, answer, delta=1e-3)
+
+ def test_metrics(self):
+ tokens1 = ['president', 'speaks']
+ tokens2 = ['president', 'talks']
+ known_answer = 0.19936788082122803
+ self.calculate_wmd(tokens1, tokens2, known_answer)
+
+ tokens1 = ['fan', 'book']
+ tokens2 = ['apple', 'orange']
+ known_answer = 1.8019972145557404
+ self.calculate_wmd(tokens1, tokens2, known_answer)
+
+
+if __name__ == '__main__':
+ unittest.main()