diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6d58dcf --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.bundle +bayesdata.yml diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 0000000..ac2cdeb --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +2.1.3 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..27e53de --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# Revisions +## 0.5.0 +* Lots of refactoring, cleanup, making methods private, etc. +* Added tests +* Removed uid from train/untrain - couldn't think of a good use case, and the logic didn't seem right: since the system doesn't keep track of which call to train created a token, the untrain option would blindly remove them. +* Changed BayesData to BayesPool since that seems more explanatory +* Moved some pool manipulation functions into BayesPool for better encapsulation +* Add to_json method +* Removed data_class from Bayes initializer since I couldn't think of a reason to make that configurable +* Create corpus in build cache instead of maintaining it in parallel \ No newline at end of file diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..20d40b6 --- /dev/null +++ b/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. 
We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. 
The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. 
Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. 
You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
\ No newline at end of file diff --git a/COPYING.LESSER b/COPYING.LESSER new file mode 100644 index 0000000..20d40b6 --- /dev/null +++ b/COPYING.LESSER @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code.
And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. 
+ + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. 
Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
\ No newline at end of file diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..3c05cb9 --- /dev/null +++ b/Gemfile @@ -0,0 +1,7 @@ +source 'http://rubygems.org' +ruby '2.1.3' +gem 'stemmer' + +group :test do + gem 'minitest' +end diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..21b10ec --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,12 @@ +GEM + remote: http://rubygems.org/ + specs: + minitest (5.4.2) + stemmer (1.0.1) + +PLATFORMS + ruby + +DEPENDENCIES + minitest + stemmer diff --git a/README.md b/README.md new file mode 100644 index 0000000..3688827 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +# Introduction +This is a Naive Bayes classifier that can be used to categorize text based on trained "pools". +Training counts how often each word is used, except for any specified stop words. +The Bayes::Bishop.guess method tokenizes the message and then calculates for each pool the probability that the message is the same "classification" as that pool. +For example, you could train the system with one pool of "spam" email and one pool of "non-spam" email. Then you could ask the guess method which pool each incoming message belongs to. + +# Usage +1. Create a Bishop::Bayes object: + + b = Bishop::Bayes.new + +2. Train with multiple pools of text: + + b.train('pool1') + b.train('pool2') + b.train('pool3') + +3. Call the guess method with a message to categorize: + + guesses = b.guess('This is a sentence') + + The return value is a hash where the keys are pool names and the values are the probability + that the message belongs to that pool. 
+ +# Features +* Stop words may be specified + + b.add_stop_words(an_array_words) + b.add_stop_word('word') + +* You can include the default stop words list + + b.load_default_stop_words + +* You can choose between the default tokenizer, a stemming tokenizer, or a custom tokenizer + + b = Bishop::Bayes.new + b = Bishop::Bayes.new(Bishop::StemmingTokenizer) + b = Bishop::Bayes.new(CustomTokenizer) + diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..2eb8847 --- /dev/null +++ b/Rakefile @@ -0,0 +1,30 @@ +require 'rake/testtask' +require 'rdoc/task' + +#desc "Default task: test" +task :default => [:test] + +desc "Run Tests" +Rake::TestTask.new( :test ) do |t| + t.pattern = "test/test_*.rb" + t.verbose = true +end + +RDoc::Task.new(:rdoc) do |rdoc| + rdoc.main = 'README.md' + rdoc.rdoc_files.include 'README.md', 'CHANGELOG.md', "lib/**/*\.rb" + rdoc.rdoc_dir = 'docs/rdoc' + rdoc.title = "Bayes::Bishop Documentation" + rdoc.options << '--line-numbers' + rdoc.options << '--fileboxes' +end + +RDoc::Task.new(:rdoc => "rdoc_markdown",:clobber_rdoc => "clobber_rdoc_markdown", :rerdoc => "rerdoc_markdown") do |rdoc| + rdoc.main = 'README.md' + rdoc.rdoc_files.include 'README.md', 'CHANGELOG.md', "lib/**/*\.rb" + rdoc.rdoc_dir = 'docs/md' + rdoc.title = "Bayes::Bishop Documentation" + rdoc.markup = 'MARKUP' + rdoc.options << '--line-numbers' + rdoc.options << '--fileboxes' +end \ No newline at end of file diff --git a/bishop.gemspec b/bishop.gemspec index 1e4f7e6..b1ba8f2 100755 --- a/bishop.gemspec +++ b/bishop.gemspec @@ -2,21 +2,21 @@ require 'rubygems' SPEC = Gem::Specification.new do |s| s.name = "bishop" - s.version = "0.4.0" - s.author = "Matt Mower" - s.email = "self@mattmower.com" - s.homepage = "http://rubyforge.org/projects/bishop/" + s.version = "0.5.0" + s.author = "Richard Harrington" + s.email = "richard@maymount.com" + s.license = 'LGPL-3.0+' + s.homepage = "https://github.com/maymount/bishop" s.platform = Gem::Platform::RUBY - s.summary = 
"Bayesian classification and ART-2 clustering library." - - candidates = Dir.glob( "{bin,docs,lib,test}/**/*" ) + s.summary = "Bayesian classification library. Refactoring of mmowers/bishop version." + s.description = "Bayesian classification library. Refactoring of mmowers/bishop version." + s.add_runtime_dependency 'stemmer' + candidates = Dir.glob( "{docs,lib,test}/**/*" ) s.files = candidates.delete_if do |item| item.include?( "CVS" ) || item.include?( "rdoc" ) end + s.extra_rdoc_files = ['README.md','CHANGELOG.md','COPYING','COPYING.LESSER'] s.require_path = "lib" -# s.autorequire = "bishop" s.has_rdoc = true - - #s.add_dependency( "stemmer", ">= 1.0.1" ) end \ No newline at end of file diff --git a/docs/md/Bishop.html b/docs/md/Bishop.html new file mode 100644 index 0000000..030df55 --- /dev/null +++ b/docs/md/Bishop.html @@ -0,0 +1,202 @@ + + + + + + +module Bishop - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+

+ module Bishop +

+ +
+ +
+ + + + +
+ + + + + + + + + +
+
+

Public Class Methods

+
+ + +
+ +
+ robinson( probs, ignore ) + + click to toggle source + +
+ + +
+ +

Default “combiner”, set in initialize. The ignore argument is accepted but never used.

+ + + + +
+
# File lib/bayes/bishop.rb, line 419
+def self.robinson( probs, ignore )
+  nth = 1.0/probs.length
+  what_is_p = 1.0 - probs.map { |p| 1.0 - p[1] }.inject( 1.0 ) { |s,v| s * v } ** nth
+  what_is_q = 1.0 - probs.map { |p| p[1] }.inject { |s,v| s * v } ** nth
+  what_is_s = ( what_is_p - what_is_q ) / ( what_is_p + what_is_q )
+  ( 1 + what_is_s ) / 2
+end
+
+ +
+ + + + +
+ + +
+ +
+ robinson_fisher( probs, ignore ) + + click to toggle source + +
+ + +
+ +

Alternative combiner

+ + + + +
+
# File lib/bayes/bishop.rb, line 428
+def self.robinson_fisher( probs, ignore )
+  n = probs.length
+  
+  begin
+    h = chi2p( -2.0 * Math.log( probs.map { |p| p[1] }.inject( 1.0 ) { |s,v| s*v } ), 2*n )
+  rescue
+    h = 0.0
+  end
+
+  begin      
+    s = chi2p( -2.0 * Math.log( probs.map { |p| 1.0 - p[1] }.inject( 1.0 ) { |s,v| s*v } ), 2*n )
+  rescue
+    s = 0.0
+  end
+  
+  ( 1 + h - s ) / 2
+end
+
+ +
+ + + + +
+ + +
+ +
+
+ + + + diff --git a/docs/md/Bishop/Bayes.html b/docs/md/Bishop/Bayes.html new file mode 100644 index 0000000..3ac61bd --- /dev/null +++ b/docs/md/Bishop/Bayes.html @@ -0,0 +1,894 @@ + + + + + + +class Bishop::Bayes - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+

+ class Bishop::Bayes +

+ +
+ +
+ + + + +
+ + + + + + + +
+
+

Attributes

+
+ + +
+
+ combiner[RW] +
+ +
+ +

Block called to combine probabilities. Set to Bishop.robinson by default

+ +
+
+ +
+
+ stop_words[R] +
+ +
+ +

An array containing stop words that the tokenizer will ignore

+ +
+
+ +
+
+ tokenizer[RW] +
+ +
+ +

instance of Tokenizer that handles tokenization

+ +
+
+ +
+ + + +
+
+

Public Class Methods

+
+ + +
+ +
+ new( tokenizer = SimpleTokenizer, &combiner ) + + click to toggle source + +
+ + +
+ +

tokenizer is the class (it is instantiated with new) that will separate the input into +tokens. See SimpleTokenizer and StemmingTokenizer for more information. +Combiner defaults to a block that calls Bishop.robinson.

+ + + + +
+
# File lib/bayes/bishop.rb, line 139
+def initialize( tokenizer = SimpleTokenizer, &combiner )
+  @tokenizer = tokenizer.new
+  @combiner = combiner || Proc.new { |probs,ignore| Bishop.robinson( probs, ignore ) }
+  @pools = {} # hash, key = pool name, value = BayesPool class

+  @cache = {} # created by calling build_cache, contains raw probabilities

+  @corpus_data = nil # created when corpus method is called, contains token totals

+  @dirty = true # indicates that cache and corpus_data are invalid

+  @stop_words = []
+end
+
+ +
+ + + + +
+ + +
+ +
+
+

Public Instance Methods

+
+ + +
+ +
+ add_stop_word( word ) + + click to toggle source + +
+ + +
+ +

Add the specified stop word

+ + + + +
+
# File lib/bayes/bishop.rb, line 199
+def add_stop_word( word )
+  @stop_words << word.downcase if !@stop_words.include?(word.downcase)
+end
+
+ +
+ + + + +
+ + +
+ +
+ add_stop_words( words ) + + click to toggle source + +
+ + +
+ +

Add an array of stop words

+ + + + +
+
# File lib/bayes/bishop.rb, line 194
+def add_stop_words( words )
+  words.each { |word| add_stop_word word if !word.empty? }
+end
+
+ +
+ + + + +
+ + +
+ +
+ guess( msg ) + + click to toggle source + +
+ + +
+ +

Call this method to classify a “message”. The return value will be a hash, +where the pool name is the key and the probability is the value, for each +pool which is a likely match for the message.

+ + + + +
+
# File lib/bayes/bishop.rb, line 302
+def guess( msg )
+  tokens = get_tokens( msg )
+  res = {}
+  
+  build_cache if dirty?
+  
+  @cache.each do |pool_name,pool|
+    p = get_probs( pool, tokens )
+    if p.length != 0
+      res[pool_name] = @combiner.call( p, pool_name )
+    end    
+  end
+  
+  h = Hash.new
+  res.sort.each { |a| h[a[0]] = a[1] }
+  h
+end
+
+ +
+ + + + +
+ + +
+ +
+ load_default_stop_words() + + click to toggle source + +
+ + +
+ +

Load the default stop word list included with Bishop

+ + + + +
+
# File lib/bayes/bishop.rb, line 209
+def load_default_stop_words
+  load_stop_words( File.join( File.dirname( __FILE__ ), 'stopwords.yml' ) )
+end
+
+ +
+ + + + +
+ + +
+ +
+ load_stop_words( source ) + + click to toggle source + +
+ + +
+ +

Load stopwords from the specified YAML formatted file

+ + + + +
+
# File lib/bayes/bishop.rb, line 204
+def load_stop_words( source )
+  File.open( source ) { |f| add_stop_words( YAML.load( f ) ) }
+end
+
+ +
+ + + + +
+ + +
+ +
+ load_yaml( file = 'bayesdata.yml' ) + + click to toggle source + +
+ + +
+ +

Load the current state from a YAML file, default = 'bayesdata.yml'

+ + + + +
+
# File lib/bayes/bishop.rb, line 258
+def load_yaml( file = 'bayesdata.yml' )
+  begin
+    File.open( file ) { |f| load_data( f ) }
+  rescue Errno::ENOENT
+    # File does not exist

+  end
+end
+
+ +
+ + + + +
+ + +
+ +
+ merge_pools( dest_name, source_name ) + + click to toggle source + +
+ + +
+ +

Merge the contents of the source pool into the destination +pool.

+ + + + +
+
# File lib/bayes/bishop.rb, line 184
+def merge_pools( dest_name, source_name )
+  @pools[dest_name].merge(@pools[source_name])
+  @dirty = true  
+end
+
+ +
+ + + + +
+ + +
+ +
+ new_pool( pool_name ) + + click to toggle source + +
+ + +
+ +

Create a new, empty, pool without training.

+ + + + +
+
# File lib/bayes/bishop.rb, line 165
+def new_pool( pool_name )
+  @dirty = true
+  @pools[ pool_name ] ||= BayesPool.new
+end
+
+ +
+ + + + +
+ + +
+ +
+ pool(pool_name) + + click to toggle source + +
+ + +
+ +

Get the pool specified by name

+ + + + +
+
# File lib/bayes/bishop.rb, line 155
+def pool pool_name
+  @pools[pool_name]
+end
+
+ +
+ + + + +
+ + +
+ +
+ pool_names() + + click to toggle source + +
+ + +
+ +

Get a list of pools

+ + + + +
+
# File lib/bayes/bishop.rb, line 160
+def pool_names
+  @pools.keys.sort
+      end
+
+ +
+ + + + +
+ + +
+ +
+ remove_pool( pool_name ) + + click to toggle source + +
+ + +
+ +

Remove the given pool

+ + + + +
+
# File lib/bayes/bishop.rb, line 171
+def remove_pool( pool_name )
+  @dirty = true
+  @pools.delete( pool_name ) 
+end
+
+ +
+ + + + +
+ + +
+ +
+ rename_pool( pool_name, new_name ) + + click to toggle source + +
+ + +
+ +

Rename the given pool

+ + + + +
+
# File lib/bayes/bishop.rb, line 177
+def rename_pool( pool_name, new_name )
+  @pools[new_name] = @pools[pool_name]
+  @pools.delete( pool_name )
+  @dirty = true
+end
+
+ +
+ + + + +
+ + +
+ +
+ save_yaml( file = 'bayesdata.yml' ) + + click to toggle source + +
+ + +
+ +

Save the current state to a YAML file, default = 'bayesdata.yml'

+ + + + +
+
# File lib/bayes/bishop.rb, line 253
+def save_yaml( file = 'bayesdata.yml' )
+  File.open( file, 'w' ) { |f| f << to_yaml }
+end
+
+ +
+ + + + +
+ + +
+ +
+ to_json() + + click to toggle source + +
+ + +
+ +

Gets the current state in JSON format

+ + + + +
+
# File lib/bayes/bishop.rb, line 248
+def to_json
+  JSON.pretty_generate(export)
+end
+
+ +
+ + + + +
+ + +
+ +
+ to_yaml() + + click to toggle source + +
+ + +
+ +

Gets the current state in YAML format

+ + + + +
+
# File lib/bayes/bishop.rb, line 243
+def to_yaml
+  export.to_yaml
+end
+
+ +
+ + + + +
+ + +
+ +
+ train( pool_name, input ) + + click to toggle source + +
+ + +
+ +

Train the specified pool with the given input.

+ +

If the input is a string it is passed through the configured Tokenizer. +Otherwise, if it is an array it is just added.

+ + + + +
+
# File lib/bayes/bishop.rb, line 273
+def train( pool_name, input )
+  tokens = input.is_a?(String) ? get_tokens( input ) : input
+  pool = new_pool( pool_name )
+  train_( pool, tokens )
+  pool.train_count += 1
+  @dirty = true
+end
+
+ +
+ + + + +
+ + +
+ +
+ trained_on?( token ) + + click to toggle source + +
+ + +
+ +

Returns true if the specified token has been trained for any pool

+ + + + +
+
# File lib/bayes/bishop.rb, line 294
+def trained_on?( token )
+  build_cache if @dirty
+  @cache.values.any? { |v| v.trained_on? token }
+end
+
+ +
+ + + + +
+ + +
+ +
+ untrain( pool_name, input ) + + click to toggle source + +
+ + +
+ +

Remove the input from the given pool. If the input is a string it is passed +through the configured Tokenizer. Otherwise, if it is an array it is used +as the list of tokens to remove.

+ + + + +
+
# File lib/bayes/bishop.rb, line 284
+def untrain( pool_name, input )
+  pool = find_pool( pool_name )
+  return if !pool
+  tokens = input.is_a?(String) ? get_tokens( input ) : input
+  untrain_( pool, tokens )
+  pool.train_count -= 1
+  @dirty = true  
+end
+
+ +
+ + + + +
+ + +
+ +
+
+ + + + diff --git a/docs/md/Bishop/SimpleTokenizer.html b/docs/md/Bishop/SimpleTokenizer.html new file mode 100644 index 0000000..d0563f4 --- /dev/null +++ b/docs/md/Bishop/SimpleTokenizer.html @@ -0,0 +1,161 @@ + + + + + + +class Bishop::SimpleTokenizer - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+

+ class Bishop::SimpleTokenizer +

+ +
+ +

A tokenizer class which splits words removing non word characters except +hyphens.

+ +
+ + + + +
+ + + + + + + + + +
+
+

Public Instance Methods

+
+ + +
+ +
+ tokenize( item, stop_words=[] ) + + click to toggle source + +
+ + +
+ + + + + + +
+
# File lib/bayes/bishop.rb, line 107
+def tokenize( item, stop_words=[] )
+  item.split( /\s+/ ).map do |i|
+    i.split( /\-/ ).map { |token| token.downcase.gsub( /\W/, "" ) }.join( "-" )
+  end.reject { |t| t == "" || t == "-" || stop_words.detect { |w| w == t } }
+end
+
+ +
+ + + + +
+ + +
+ +
+
+ + + + diff --git a/docs/md/Bishop/StemmingTokenizer.html b/docs/md/Bishop/StemmingTokenizer.html new file mode 100644 index 0000000..a86b6a5 --- /dev/null +++ b/docs/md/Bishop/StemmingTokenizer.html @@ -0,0 +1,164 @@ + + + + + + +class Bishop::StemmingTokenizer - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+

+ class Bishop::StemmingTokenizer +

+ +
+ +

A tokenizer which, having split words, reduces them to porter stemmed +tokens

+ +
+ + + + +
+ + + + + + + + + +
+
+

Public Instance Methods

+
+ + +
+ +
+ tokenize( item, stop_words=[] ) + + click to toggle source + +
+ + +
+ + + + +
+ Calls superclass method + Bishop::SimpleTokenizer#tokenize +
+ + + +
+
# File lib/bayes/bishop.rb, line 116
+def tokenize( item, stop_words=[] )
+  super( item, stop_words ).map { |word| word.stem }
+end
+
+ +
+ + + + +
+ + +
+ +
+
+ + + + diff --git a/docs/md/CHANGELOG_md.html b/docs/md/CHANGELOG_md.html new file mode 100644 index 0000000..8822b29 --- /dev/null +++ b/docs/md/CHANGELOG_md.html @@ -0,0 +1,119 @@ + + + + + + +CHANGELOG - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+ +

Revisions

+ +

0.5.0

+
  • +

    Lots of refactoring, cleanup, making methods private, etc.

    +
  • +

    Added tests

    +
  • +

Removed uid from train/untrain - couldn't think of a good use case, and +the logic didn't seem right: since the system doesn't keep track of +which call to train created a token, the untrain option would blindly remove +them.

    +
  • +

    Changed BayesData to BayesPool since that seems more explanatory

    +
  • +

    Moved some pool manipulation functions into BayesPool for better +encapsulation

    +
  • +

    Add to_json method

    +
  • +

    Removed data_class from Bayes initializer since I couldn't think of a +reason to make that configurable

    +
  • +

    Create corpus in build cache instead of maintaining it in parallel

    +
+
+ + + + + diff --git a/docs/md/README_md.html b/docs/md/README_md.html new file mode 100644 index 0000000..4e05df2 --- /dev/null +++ b/docs/md/README_md.html @@ -0,0 +1,143 @@ + + + + + + +README - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+ +

Introduction

+ +

This is a Naive Bayes classifier that can be used to categorize text based +on trained “pools”. Training counts how often each word is used, except for +any specified stop words. The Bishop::Bayes#guess method tokenizes the +message and then calculates for each pool the probability that the message +is the same “classification” as that pool. For example, you could train the +system with one pool of “spam” email and one pool of “non-spam” email. Then +you could ask the guess method which pool each incoming message belongs to.

+ +

Usage

+
  1. +

    Create a Bishop::Bayes object:

    + +
    b = Bishop::Bayes.new
    +
  2. +

    Train with multiple pools of text:

    + +
    b.train('pool1')  
    +b.train('pool2')  
    +b.train('pool3')
    +
  3. +

    Call the guess method with a message to categorize:

    + +
    guesses = b.guess('This is a sentence')
    +
+ +

The return value is a hash where the keys are pool names and the values are +the probability that the message belongs to that pool.

+ +

Features

+
  • +

    Stop words may be specified

    + +
    b.add_stop_words(an_array_words)  
    +b.add_stop_word('word')
    +
  • +

    You can include the default stop words list

    + +
    b.load_default_stop_words
    +
  • +

    You can choose between the default tokenizer, a stemming tokenizer, or a +custom tokenizer

    + +
    b = Bishop::Bayes.new
    +b = Bishop::Bayes.new(Bishop::StemmingTokenizer)
    +b = Bishop::Bayes.new(CustomTokenizer)
    +
+
+ + + + + diff --git a/docs/md/created.rid b/docs/md/created.rid new file mode 100644 index 0000000..427b56f --- /dev/null +++ b/docs/md/created.rid @@ -0,0 +1,4 @@ +Sun, 02 Nov 2014 22:30:34 -0800 +README.md Sun, 02 Nov 2014 22:18:55 -0800 +CHANGELOG.md Sun, 02 Nov 2014 21:13:42 -0800 +lib/bayes/bishop.rb Sun, 02 Nov 2014 22:30:31 -0800 diff --git a/docs/md/fonts.css b/docs/md/fonts.css new file mode 100644 index 0000000..e9e7211 --- /dev/null +++ b/docs/md/fonts.css @@ -0,0 +1,167 @@ +/* + * Copyright 2010, 2012 Adobe Systems Incorporated (http://www.adobe.com/), + * with Reserved Font Name "Source". All Rights Reserved. Source is a + * trademark of Adobe Systems Incorporated in the United States and/or other + * countries. + * + * This Font Software is licensed under the SIL Open Font License, Version + * 1.1. + * + * This license is copied below, and is also available with a FAQ at: + * http://scripts.sil.org/OFL + */ + +@font-face { + font-family: "Source Code Pro"; + font-style: normal; + font-weight: 400; + src: local("Source Code Pro"), + local("SourceCodePro-Regular"), + url("fonts/SourceCodePro-Regular.ttf") format("truetype"); +} + +@font-face { + font-family: "Source Code Pro"; + font-style: normal; + font-weight: 700; + src: local("Source Code Pro Bold"), + local("SourceCodePro-Bold"), + url("fonts/SourceCodePro-Bold.ttf") format("truetype"); +} + +/* + * Copyright (c) 2010, Łukasz Dziedzic (dziedzic@typoland.com), + * with Reserved Font Name Lato. + * + * This Font Software is licensed under the SIL Open Font License, Version + * 1.1. 
+ * + * This license is copied below, and is also available with a FAQ at: + * http://scripts.sil.org/OFL + */ + +@font-face { + font-family: "Lato"; + font-style: normal; + font-weight: 300; + src: local("Lato Light"), + local("Lato-Light"), + url("fonts/Lato-Light.ttf") format("truetype"); +} + +@font-face { + font-family: "Lato"; + font-style: italic; + font-weight: 300; + src: local("Lato Light Italic"), + local("Lato-LightItalic"), + url("fonts/Lato-LightItalic.ttf") format("truetype"); +} + +@font-face { + font-family: "Lato"; + font-style: normal; + font-weight: 700; + src: local("Lato Regular"), + local("Lato-Regular"), + url("fonts/Lato-Regular.ttf") format("truetype"); +} + +@font-face { + font-family: "Lato"; + font-style: italic; + font-weight: 700; + src: local("Lato Italic"), + local("Lato-Italic"), + url("fonts/Lato-RegularItalic.ttf") format("truetype"); +} + +/* + * ----------------------------------------------------------- + * SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 + * ----------------------------------------------------------- + * + * PREAMBLE + * The goals of the Open Font License (OFL) are to stimulate worldwide + * development of collaborative font projects, to support the font creation + * efforts of academic and linguistic communities, and to provide a free and + * open framework in which fonts may be shared and improved in partnership + * with others. + * + * The OFL allows the licensed fonts to be used, studied, modified and + * redistributed freely as long as they are not sold by themselves. The + * fonts, including any derivative works, can be bundled, embedded, + * redistributed and/or sold with any software provided that any reserved + * names are not used by derivative works. The fonts and derivatives, + * however, cannot be released under any other type of license. The + * requirement for fonts to remain under this license does not apply + * to any document created using the fonts or their derivatives. 
+ * + * DEFINITIONS + * "Font Software" refers to the set of files released by the Copyright + * Holder(s) under this license and clearly marked as such. This may + * include source files, build scripts and documentation. + * + * "Reserved Font Name" refers to any names specified as such after the + * copyright statement(s). + * + * "Original Version" refers to the collection of Font Software components as + * distributed by the Copyright Holder(s). + * + * "Modified Version" refers to any derivative made by adding to, deleting, + * or substituting -- in part or in whole -- any of the components of the + * Original Version, by changing formats or by porting the Font Software to a + * new environment. + * + * "Author" refers to any designer, engineer, programmer, technical + * writer or other person who contributed to the Font Software. + * + * PERMISSION & CONDITIONS + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of the Font Software, to use, study, copy, merge, embed, modify, + * redistribute, and sell modified and unmodified copies of the Font + * Software, subject to the following conditions: + * + * 1) Neither the Font Software nor any of its individual components, + * in Original or Modified Versions, may be sold by itself. + * + * 2) Original or Modified Versions of the Font Software may be bundled, + * redistributed and/or sold with any software, provided that each copy + * contains the above copyright notice and this license. These can be + * included either as stand-alone text files, human-readable headers or + * in the appropriate machine-readable metadata fields within text or + * binary files as long as those fields can be easily viewed by the user. + * + * 3) No Modified Version of the Font Software may use the Reserved Font + * Name(s) unless explicit written permission is granted by the corresponding + * Copyright Holder. This restriction only applies to the primary font name as + * presented to the users. 
+ * + * 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font + * Software shall not be used to promote, endorse or advertise any + * Modified Version, except to acknowledge the contribution(s) of the + * Copyright Holder(s) and the Author(s) or with their explicit written + * permission. + * + * 5) The Font Software, modified or unmodified, in part or in whole, + * must be distributed entirely under this license, and must not be + * distributed under any other license. The requirement for fonts to + * remain under this license does not apply to any document created + * using the Font Software. + * + * TERMINATION + * This license becomes null and void if any of the above conditions are + * not met. + * + * DISCLAIMER + * THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL + * DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM + * OTHER DEALINGS IN THE FONT SOFTWARE. 
+ */ + diff --git a/docs/md/fonts/Lato-Light.ttf b/docs/md/fonts/Lato-Light.ttf new file mode 100644 index 0000000..b49dd43 Binary files /dev/null and b/docs/md/fonts/Lato-Light.ttf differ diff --git a/docs/md/fonts/Lato-LightItalic.ttf b/docs/md/fonts/Lato-LightItalic.ttf new file mode 100644 index 0000000..7959fef Binary files /dev/null and b/docs/md/fonts/Lato-LightItalic.ttf differ diff --git a/docs/md/fonts/Lato-Regular.ttf b/docs/md/fonts/Lato-Regular.ttf new file mode 100644 index 0000000..839cd58 Binary files /dev/null and b/docs/md/fonts/Lato-Regular.ttf differ diff --git a/docs/md/fonts/Lato-RegularItalic.ttf b/docs/md/fonts/Lato-RegularItalic.ttf new file mode 100644 index 0000000..bababa0 Binary files /dev/null and b/docs/md/fonts/Lato-RegularItalic.ttf differ diff --git a/docs/md/fonts/SourceCodePro-Bold.ttf b/docs/md/fonts/SourceCodePro-Bold.ttf new file mode 100644 index 0000000..61e3090 Binary files /dev/null and b/docs/md/fonts/SourceCodePro-Bold.ttf differ diff --git a/docs/md/fonts/SourceCodePro-Regular.ttf b/docs/md/fonts/SourceCodePro-Regular.ttf new file mode 100644 index 0000000..85686d9 Binary files /dev/null and b/docs/md/fonts/SourceCodePro-Regular.ttf differ diff --git a/docs/md/images/add.png b/docs/md/images/add.png new file mode 100644 index 0000000..6332fef Binary files /dev/null and b/docs/md/images/add.png differ diff --git a/docs/md/images/arrow_up.png b/docs/md/images/arrow_up.png new file mode 100644 index 0000000..1ebb193 Binary files /dev/null and b/docs/md/images/arrow_up.png differ diff --git a/docs/md/images/brick.png b/docs/md/images/brick.png new file mode 100644 index 0000000..7851cf3 Binary files /dev/null and b/docs/md/images/brick.png differ diff --git a/docs/md/images/brick_link.png b/docs/md/images/brick_link.png new file mode 100644 index 0000000..9ebf013 Binary files /dev/null and b/docs/md/images/brick_link.png differ diff --git a/docs/md/images/bug.png b/docs/md/images/bug.png new file mode 100644 index 
0000000..2d5fb90 Binary files /dev/null and b/docs/md/images/bug.png differ diff --git a/docs/md/images/bullet_black.png b/docs/md/images/bullet_black.png new file mode 100644 index 0000000..5761970 Binary files /dev/null and b/docs/md/images/bullet_black.png differ diff --git a/docs/md/images/bullet_toggle_minus.png b/docs/md/images/bullet_toggle_minus.png new file mode 100644 index 0000000..b47ce55 Binary files /dev/null and b/docs/md/images/bullet_toggle_minus.png differ diff --git a/docs/md/images/bullet_toggle_plus.png b/docs/md/images/bullet_toggle_plus.png new file mode 100644 index 0000000..9ab4a89 Binary files /dev/null and b/docs/md/images/bullet_toggle_plus.png differ diff --git a/docs/md/images/date.png b/docs/md/images/date.png new file mode 100644 index 0000000..783c833 Binary files /dev/null and b/docs/md/images/date.png differ diff --git a/docs/md/images/delete.png b/docs/md/images/delete.png new file mode 100644 index 0000000..08f2493 Binary files /dev/null and b/docs/md/images/delete.png differ diff --git a/docs/md/images/find.png b/docs/md/images/find.png new file mode 100644 index 0000000..1547479 Binary files /dev/null and b/docs/md/images/find.png differ diff --git a/docs/md/images/loadingAnimation.gif b/docs/md/images/loadingAnimation.gif new file mode 100644 index 0000000..82290f4 Binary files /dev/null and b/docs/md/images/loadingAnimation.gif differ diff --git a/docs/md/images/macFFBgHack.png b/docs/md/images/macFFBgHack.png new file mode 100644 index 0000000..c6473b3 Binary files /dev/null and b/docs/md/images/macFFBgHack.png differ diff --git a/docs/md/images/package.png b/docs/md/images/package.png new file mode 100644 index 0000000..da3c2a2 Binary files /dev/null and b/docs/md/images/package.png differ diff --git a/docs/md/images/page_green.png b/docs/md/images/page_green.png new file mode 100644 index 0000000..de8e003 Binary files /dev/null and b/docs/md/images/page_green.png differ diff --git a/docs/md/images/page_white_text.png 
b/docs/md/images/page_white_text.png new file mode 100644 index 0000000..813f712 Binary files /dev/null and b/docs/md/images/page_white_text.png differ diff --git a/docs/md/images/page_white_width.png b/docs/md/images/page_white_width.png new file mode 100644 index 0000000..1eb8809 Binary files /dev/null and b/docs/md/images/page_white_width.png differ diff --git a/docs/md/images/plugin.png b/docs/md/images/plugin.png new file mode 100644 index 0000000..6187b15 Binary files /dev/null and b/docs/md/images/plugin.png differ diff --git a/docs/md/images/ruby.png b/docs/md/images/ruby.png new file mode 100644 index 0000000..f763a16 Binary files /dev/null and b/docs/md/images/ruby.png differ diff --git a/docs/md/images/tag_blue.png b/docs/md/images/tag_blue.png new file mode 100644 index 0000000..3f02b5f Binary files /dev/null and b/docs/md/images/tag_blue.png differ diff --git a/docs/md/images/tag_green.png b/docs/md/images/tag_green.png new file mode 100644 index 0000000..83ec984 Binary files /dev/null and b/docs/md/images/tag_green.png differ diff --git a/docs/md/images/transparent.png b/docs/md/images/transparent.png new file mode 100644 index 0000000..d665e17 Binary files /dev/null and b/docs/md/images/transparent.png differ diff --git a/docs/md/images/wrench.png b/docs/md/images/wrench.png new file mode 100644 index 0000000..5c8213f Binary files /dev/null and b/docs/md/images/wrench.png differ diff --git a/docs/md/images/wrench_orange.png b/docs/md/images/wrench_orange.png new file mode 100644 index 0000000..565a933 Binary files /dev/null and b/docs/md/images/wrench_orange.png differ diff --git a/docs/md/images/zoom.png b/docs/md/images/zoom.png new file mode 100644 index 0000000..908612e Binary files /dev/null and b/docs/md/images/zoom.png differ diff --git a/docs/md/index.html b/docs/md/index.html new file mode 100644 index 0000000..15687e8 --- /dev/null +++ b/docs/md/index.html @@ -0,0 +1,149 @@ + + + + + + +Bayes::Bishop Documentation + + + + + + + + + + + + + 
+ + + + +
+ + +

Introduction

+ +

This is a Naive Bayes classifier that can be used to categorize text based +on trained “pools”. Training counts how often each word is used, except for +any specified stop words. The Bishop::Bayes#guess method tokenizes the +message and then calculates for each pool the probability that the message +has the same “classification” as that pool. For example, you could train the +system with one pool of “spam” email and one pool of “non-spam” email. Then +you could ask the guess method which pool each incoming message belongs to.

+ +

Usage

+
  1. +

    Create a Bishop::Bayes object:

    + +
    b = Bishop::Bayes.new
    +
  2. +

    Train with multiple pools of text:

    + +
    b.train('pool1', 'text belonging to pool1')  
    +b.train('pool2', 'text belonging to pool2')  
    +b.train('pool3', 'text belonging to pool3')
    +
  3. +

    Call the guess method with a message to categorize:

    + +
    guesses = b.guess('This is a sentence')
    +
+ +

The return value is a hash where the keys are pool names and the values are +the probability that the message belongs to that pool.

+ +

Features

+
  • +

    Stop words may be specified

    + +
    b.add_stop_words(an_array_words)  
    +b.add_stop_word('word')
    +
  • +

    You can include the default stop words list

    + +
    b.load_default_stop_words
    +
  • +

    You can choose between the default tokenizer, a stemming tokenizer, or a +custom tokenizer

    + +
    b = Bishop::Bayes.new
    +b = Bishop::Bayes.new(Bishop::StemmingTokenizer)
    +b = Bishop::Bayes.new(CustomTokenizer)
    +
+
+ + + + + diff --git a/docs/md/js/darkfish.js b/docs/md/js/darkfish.js new file mode 100644 index 0000000..06fef3b --- /dev/null +++ b/docs/md/js/darkfish.js @@ -0,0 +1,140 @@ +/** + * + * Darkfish Page Functions + * $Id: darkfish.js 53 2009-01-07 02:52:03Z deveiant $ + * + * Author: Michael Granger + * + */ + +/* Provide console simulation for firebug-less environments */ +if (!("console" in window) || !("firebug" in console)) { + var names = ["log", "debug", "info", "warn", "error", "assert", "dir", "dirxml", + "group", "groupEnd", "time", "timeEnd", "count", "trace", "profile", "profileEnd"]; + + window.console = {}; + for (var i = 0; i < names.length; ++i) + window.console[names[i]] = function() {}; +}; + + +/** + * Unwrap the first element that matches the given @expr@ from the targets and return them. + */ +$.fn.unwrap = function( expr ) { + return this.each( function() { + $(this).parents( expr ).eq( 0 ).after( this ).remove(); + }); +}; + + +function showSource( e ) { + var target = e.target; + var codeSections = $(target). + parents('.method-detail'). + find('.method-source-code'); + + $(target). + parents('.method-detail'). + find('.method-source-code'). + slideToggle(); +}; + +function hookSourceViews() { + $('.method-heading').click( showSource ); +}; + +function toggleDebuggingSection() { + $('.debugging-section').slideToggle(); +}; + +function hookDebuggingToggle() { + $('#debugging-toggle img').click( toggleDebuggingSection ); +}; + +function hookSearch() { + var input = $('#search-field').eq(0); + var result = $('#search-results').eq(0); + $(result).show(); + + var search_section = $('#search-section').get(0); + $(search_section).show(); + + var search = new Search(search_data, input, result); + + search.renderItem = function(result) { + var li = document.createElement('li'); + var html = ''; + + // TODO add relative path to + + + + + + + + + + +
+

Table of Contents - Bayes::Bishop Documentation

+ +

Pages

+ + +

Classes and Modules

+ + +

Methods

+ +
+ + + + diff --git a/docs/rdoc/Bishop.html b/docs/rdoc/Bishop.html new file mode 100644 index 0000000..030df55 --- /dev/null +++ b/docs/rdoc/Bishop.html @@ -0,0 +1,202 @@ + + + + + + +module Bishop - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+

+ module Bishop +

+ +
+ +
+ + + + +
+ + + + + + + + + +
+
+

Public Class Methods

+
+ + +
+ +
+ robinson( probs, ignore ) + + click to toggle source + +
+ + +
+ +

Default “combiner” set in initialize. The ignore argument is truly ignored.

+ + + + +
+
# File lib/bayes/bishop.rb, line 419
+def self.robinson( probs, ignore )
+  nth = 1.0/probs.length
+  what_is_p = 1.0 - probs.map { |p| 1.0 - p[1] }.inject( 1.0 ) { |s,v| s * v } ** nth
+  what_is_q = 1.0 - probs.map { |p| p[1] }.inject { |s,v| s * v } ** nth
+  what_is_s = ( what_is_p - what_is_q ) / ( what_is_p + what_is_q )
+  ( 1 + what_is_s ) / 2
+end
+
+ +
+ + + + +
+ + +
+ +
+ robinson_fisher( probs, ignore ) + + click to toggle source + +
+ + +
+ +

Alternative combiner

+ + + + +
+
# File lib/bayes/bishop.rb, line 428
+def self.robinson_fisher( probs, ignore )
+  n = probs.length
+  
+  begin
+    h = chi2p( -2.0 * Math.log( probs.map { |p| p[1] }.inject( 1.0 ) { |s,v| s*v } ), 2*n )
+  rescue
+    h = 0.0
+  end
+
+  begin      
+    s = chi2p( -2.0 * Math.log( probs.map { |p| 1.0 - p[1] }.inject( 1.0 ) { |s,v| s*v } ), 2*n )
+  rescue
+    s = 0.0
+  end
+  
+  ( 1 + h - s ) / 2
+end
+
+ +
+ + + + +
+ + +
+ +
+
+ + + + diff --git a/docs/rdoc/Bishop/Bayes.html b/docs/rdoc/Bishop/Bayes.html new file mode 100644 index 0000000..3ac61bd --- /dev/null +++ b/docs/rdoc/Bishop/Bayes.html @@ -0,0 +1,894 @@ + + + + + + +class Bishop::Bayes - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+

+ class Bishop::Bayes +

+ +
+ +
+ + + + +
+ + + + + + + +
+
+

Attributes

+
+ + +
+
+ combiner[RW] +
+ +
+ +

Block called to combine probabilities. Set to Bishop.robinson by default

+ +
+
+ +
+
+ stop_words[R] +
+ +
+ +

An array containing stop words that the tokenizer will ignore

+ +
+
+ +
+
+ tokenizer[RW] +
+ +
+ +

instance of Tokenizer that handles tokenization

+ +
+
+ +
+ + + +
+
+

Public Class Methods

+
+ + +
+ +
+ new( tokenizer = SimpleTokenizer, &combiner ) + + click to toggle source + +
+ + +
+ +

tokenizer is the class (not an instance) that will separate the input into +tokens; it is instantiated with new. See SimpleTokenizer and StemmingTokenizer for more information. +combiner defaults to a block that calls Bishop.robinson.

+ + + + +
+
# File lib/bayes/bishop.rb, line 139
+def initialize( tokenizer = SimpleTokenizer, &combiner )
+  @tokenizer = tokenizer.new
+  @combiner = combiner || Proc.new { |probs,ignore| Bishop.robinson( probs, ignore ) }
+  @pools = {} # hash, key = pool name, value = BayesPool class

+  @cache = {} # created by calling build_cache, contains raw probabilities

+  @corpus_data = nil # created when corpus method is called, contains token totals

+  @dirty = true # indicates that cache and corpus_data are invalid

+  @stop_words = []
+end
+
+ +
+ + + + +
+ + +
+ +
+
+

Public Instance Methods

+
+ + +
+ +
+ add_stop_word( word ) + + click to toggle source + +
+ + +
+ +

Add the specified stop word

+ + + + +
+
# File lib/bayes/bishop.rb, line 199
+def add_stop_word( word )
+  @stop_words << word.downcase if !@stop_words.include?(word.downcase)
+end
+
+ +
+ + + + +
+ + +
+ +
+ add_stop_words( words ) + + click to toggle source + +
+ + +
+ +

Add an array of stop words

+ + + + +
+
# File lib/bayes/bishop.rb, line 194
+def add_stop_words( words )
+  words.each { |word| add_stop_word word if !word.empty? }
+end
+
+ +
+ + + + +
+ + +
+ +
+ guess( msg ) + + click to toggle source + +
+ + +
+ +

Call this method to classify a “message”. The return value will be a hash, +with the pool name as the key and the probability as the value, for each +pool which is a likely match for the message.

+ + + + +
+
# File lib/bayes/bishop.rb, line 302
+def guess( msg )
+  tokens = get_tokens( msg )
+  res = {}
+  
+  build_cache if dirty?
+  
+  @cache.each do |pool_name,pool|
+    p = get_probs( pool, tokens )
+    if p.length != 0
+      res[pool_name] = @combiner.call( p, pool_name )
+    end    
+  end
+  
+  h = Hash.new
+  res.sort.each { |a| h[a[0]] = a[1] }
+  h
+end
+
+ +
+ + + + +
+ + +
+ +
+ load_default_stop_words() + + click to toggle source + +
+ + +
+ +

Load the default stop word list included with Bishop

+ + + + +
+
# File lib/bayes/bishop.rb, line 209
+def load_default_stop_words
+  load_stop_words( File.join( File.dirname( __FILE__ ), 'stopwords.yml' ) )
+end
+
+ +
+ + + + +
+ + +
+ +
+ load_stop_words( source ) + + click to toggle source + +
+ + +
+ +

Load stopwords from the specified YAML formatted file

+ + + + +
+
# File lib/bayes/bishop.rb, line 204
+def load_stop_words( source )
+  File.open( source ) { |f| add_stop_words( YAML.load( f ) ) }
+end
+
+ +
+ + + + +
+ + +
+ +
+ load_yaml( file = 'bayesdata.yml' ) + + click to toggle source + +
+ + +
+ +

Load the current state from a YAML file, default = 'bayesdata.yml'

+ + + + +
+
# File lib/bayes/bishop.rb, line 258
+def load_yaml( file = 'bayesdata.yml' )
+  begin
+    File.open( file ) { |f| load_data( f ) }
+  rescue Errno::ENOENT
+    # File does not exist

+  end
+end
+
+ +
+ + + + +
+ + +
+ +
+ merge_pools( dest_name, source_name ) + + click to toggle source + +
+ + +
+ +

Merge the contents of the source pool into the destination +pool.

+ + + + +
+
# File lib/bayes/bishop.rb, line 184
+def merge_pools( dest_name, source_name )
+  @pools[dest_name].merge(@pools[source_name])
+  @dirty = true  
+end
+
+ +
+ + + + +
+ + +
+ +
+ new_pool( pool_name ) + + click to toggle source + +
+ + +
+ +

Create a new, empty, pool without training.

+ + + + +
+
# File lib/bayes/bishop.rb, line 165
+def new_pool( pool_name )
+  @dirty = true
+  @pools[ pool_name ] ||= BayesPool.new
+end
+
+ +
+ + + + +
+ + +
+ +
+ pool(pool_name) + + click to toggle source + +
+ + +
+ +

Get the pool specified by name

+ + + + +
+
# File lib/bayes/bishop.rb, line 155
+def pool pool_name
+  @pools[pool_name]
+end
+
+ +
+ + + + +
+ + +
+ +
+ pool_names() + + click to toggle source + +
+ + +
+ +

Get a list of pools

+ + + + +
+
# File lib/bayes/bishop.rb, line 160
+def pool_names
+  @pools.keys.sort
+      end
+
+ +
+ + + + +
+ + +
+ +
+ remove_pool( pool_name ) + + click to toggle source + +
+ + +
+ +

Remove the given pool

+ + + + +
+
# File lib/bayes/bishop.rb, line 171
+def remove_pool( pool_name )
+  @dirty = true
+  @pools.delete( pool_name ) 
+end
+
+ +
+ + + + +
+ + +
+ +
+ rename_pool( pool_name, new_name ) + + click to toggle source + +
+ + +
+ +

Rename the given pool

+ + + + +
+
# File lib/bayes/bishop.rb, line 177
+def rename_pool( pool_name, new_name )
+  @pools[new_name] = @pools[pool_name]
+  @pools.delete( pool_name )
+  @dirty = true
+end
+
+ +
+ + + + +
+ + +
+ +
+ save_yaml( file = 'bayesdata.yml' ) + + click to toggle source + +
+ + +
+ +

Save the current state to a YAML file, default = 'bayesdata.yml'

+ + + + +
+
# File lib/bayes/bishop.rb, line 253
+def save_yaml( file = 'bayesdata.yml' )
+  File.open( file, 'w' ) { |f| f << to_yaml }
+end
+
+ +
+ + + + +
+ + +
+ +
+ to_json() + + click to toggle source + +
+ + +
+ +

Gets the current state in JSON format

+ + + + +
+
# File lib/bayes/bishop.rb, line 248
+def to_json
+  JSON.pretty_generate(export)
+end
+
+ +
+ + + + +
+ + +
+ +
+ to_yaml() + + click to toggle source + +
+ + +
+ +

Gets the current state in YAML format

+ + + + +
+
# File lib/bayes/bishop.rb, line 243
+def to_yaml
+  export.to_yaml
+end
+
+ +
+ + + + +
+ + +
+ +
+ train( pool_name, input ) + + click to toggle source + +
+ + +
+ +

Train the specified pool with the given input.

+ +

If the input is a string it is passed through the configured Tokenizer. +Otherwise, if it is an array it is just added.

+ + + + +
+
# File lib/bayes/bishop.rb, line 273
+def train( pool_name, input )
+  tokens = input.is_a?(String) ? get_tokens( input ) : input
+  pool = new_pool( pool_name )
+  train_( pool, tokens )
+  pool.train_count += 1
+  @dirty = true
+end
+
+ +
+ + + + +
+ + +
+ +
+ trained_on?( token ) + + click to toggle source + +
+ + +
+ +

Returns true if the specified token has been trained for any pool

+ + + + +
+
# File lib/bayes/bishop.rb, line 294
+def trained_on?( token )
+  build_cache if @dirty
+  @cache.values.any? { |v| v.trained_on? token }
+end
+
+ +
+ + + + +
+ + +
+ +
+ untrain( pool_name, input ) + + click to toggle source + +
+ + +
+ +

Remove the input from the given pool. If the input is a string it is passed +through the configured Tokenizer. Otherwise, if it is an array, its elements are +used as the tokens to remove.

+ + + + +
+
# File lib/bayes/bishop.rb, line 284
+def untrain( pool_name, input )
+  pool = find_pool( pool_name )
+  return if !pool
+  tokens = input.is_a?(String) ? get_tokens( input ) : input
+  untrain_( pool, tokens )
+  pool.train_count -= 1
+  @dirty = true  
+end
+
+ +
+ + + + +
+ + +
+ +
+
+ + + + diff --git a/docs/rdoc/Bishop/SimpleTokenizer.html b/docs/rdoc/Bishop/SimpleTokenizer.html new file mode 100644 index 0000000..d0563f4 --- /dev/null +++ b/docs/rdoc/Bishop/SimpleTokenizer.html @@ -0,0 +1,161 @@ + + + + + + +class Bishop::SimpleTokenizer - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+

+ class Bishop::SimpleTokenizer +

+ +
+ +

A tokenizer class which splits words, removing non-word characters except +hyphens.

+ +
+ + + + +
+ + + + + + + + + +
+
+

Public Instance Methods

+
+ + +
+ +
+ tokenize( item, stop_words=[] ) + + click to toggle source + +
+ + +
+ + + + + + +
+
# File lib/bayes/bishop.rb, line 107
+def tokenize( item, stop_words=[] )
+  item.split( /\s+/ ).map do |i|
+    i.split( /\-/ ).map { |token| token.downcase.gsub( /\W/, "" ) }.join( "-" )
+  end.reject { |t| t == "" || t == "-" || stop_words.detect { |w| w == t } }
+end
+
+ +
+ + + + +
+ + +
+ +
+
+ + + + diff --git a/docs/rdoc/Bishop/StemmingTokenizer.html b/docs/rdoc/Bishop/StemmingTokenizer.html new file mode 100644 index 0000000..a86b6a5 --- /dev/null +++ b/docs/rdoc/Bishop/StemmingTokenizer.html @@ -0,0 +1,164 @@ + + + + + + +class Bishop::StemmingTokenizer - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+

+ class Bishop::StemmingTokenizer +

+ +
+ +

A tokenizer which, having split words, reduces them to porter stemmed +tokens

+ +
+ + + + +
+ + + + + + + + + +
+
+

Public Instance Methods

+
+ + +
+ +
+ tokenize( item, stop_words=[] ) + + click to toggle source + +
+ + +
+ + + + +
+ Calls superclass method + Bishop::SimpleTokenizer#tokenize +
+ + + +
+
# File lib/bayes/bishop.rb, line 116
+def tokenize( item, stop_words=[] )
+  super( item, stop_words ).map { |word| word.stem }
+end
+
+ +
+ + + + +
+ + +
+ +
+
+ + + + diff --git a/docs/rdoc/CHANGELOG_md.html b/docs/rdoc/CHANGELOG_md.html new file mode 100644 index 0000000..8822b29 --- /dev/null +++ b/docs/rdoc/CHANGELOG_md.html @@ -0,0 +1,119 @@ + + + + + + +CHANGELOG - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+ +

Revisions

+ +

0.5.0

+
  • +

    Lots of refactoring, cleanup, making methods private, etc.

    +
  • +

    Added tests

    +
  • +

    Removed uid from train/untrain - couldn't think of a good use case, and +the logic didn't seem right: since the system doesn't keep track of +which call to train created a token, the untrain option would blindly remove +them.

    +
  • +

    Changed BayesData to BayesPool since that seems more explanatory

    +
  • +

    Moved some pool manipulation functions into BayesPool for better +encapsulation

    +
  • +

    Add to_json method

    +
  • +

    Removed data_class from Bayes initializer since I couldn't think of a +reason to make that configurable

    +
  • +

    Create corpus in build cache instead of maintaining it in parallel

    +
+
+ + + + + diff --git a/docs/rdoc/README_md.html b/docs/rdoc/README_md.html new file mode 100644 index 0000000..4e05df2 --- /dev/null +++ b/docs/rdoc/README_md.html @@ -0,0 +1,143 @@ + + + + + + +README - Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+ +

Introduction

+ +

This is a Naive Bayes classifier that can be used to categorize text based +on trained “pools”. Training counts how often each word is used, except for +any specified stop words. The Bishop::Bayes#guess method tokenizes the +message and then calculates for each pool the probability that the message +has the same “classification” as that pool. For example, you could train the +system with one pool of “spam” email and one pool of “non-spam” email. Then +you could ask the guess method which pool each incoming message belongs to.

+ +

Usage

+
  1. +

    Create a Bishop::Bayes object:

    + +
    b = Bishop::Bayes.new
    +
  2. +

    Train with multiple pools of text:

    + +
    b.train('pool1', 'text belonging to pool1')  
    +b.train('pool2', 'text belonging to pool2')  
    +b.train('pool3', 'text belonging to pool3')
    +
  3. +

    Call the guess method with a message to categorize:

    + +
    guesses = b.guess('This is a sentence')
    +
+ +

The return value is a hash where the keys are pool names and the values are +the probability that the message belongs to that pool.

+ +

Features

+
  • +

    Stop words may be specified

    + +
    b.add_stop_words(an_array_words)  
    +b.add_stop_word('word')
    +
  • +

    You can include the default stop words list

    + +
    b.load_default_stop_words
    +
  • +

    You can choose between the default tokenizer, a stemming tokenizer, or a +custom tokenizer

    + +
    b = Bishop::Bayes.new
    +b = Bishop::Bayes.new(Bishop::StemmingTokenizer)
    +b = Bishop::Bayes.new(CustomTokenizer)
    +
+
+ + + + + diff --git a/docs/rdoc/created.rid b/docs/rdoc/created.rid new file mode 100644 index 0000000..427b56f --- /dev/null +++ b/docs/rdoc/created.rid @@ -0,0 +1,4 @@ +Sun, 02 Nov 2014 22:30:34 -0800 +README.md Sun, 02 Nov 2014 22:18:55 -0800 +CHANGELOG.md Sun, 02 Nov 2014 21:13:42 -0800 +lib/bayes/bishop.rb Sun, 02 Nov 2014 22:30:31 -0800 diff --git a/docs/rdoc/fonts.css b/docs/rdoc/fonts.css new file mode 100644 index 0000000..e9e7211 --- /dev/null +++ b/docs/rdoc/fonts.css @@ -0,0 +1,167 @@ +/* + * Copyright 2010, 2012 Adobe Systems Incorporated (http://www.adobe.com/), + * with Reserved Font Name "Source". All Rights Reserved. Source is a + * trademark of Adobe Systems Incorporated in the United States and/or other + * countries. + * + * This Font Software is licensed under the SIL Open Font License, Version + * 1.1. + * + * This license is copied below, and is also available with a FAQ at: + * http://scripts.sil.org/OFL + */ + +@font-face { + font-family: "Source Code Pro"; + font-style: normal; + font-weight: 400; + src: local("Source Code Pro"), + local("SourceCodePro-Regular"), + url("fonts/SourceCodePro-Regular.ttf") format("truetype"); +} + +@font-face { + font-family: "Source Code Pro"; + font-style: normal; + font-weight: 700; + src: local("Source Code Pro Bold"), + local("SourceCodePro-Bold"), + url("fonts/SourceCodePro-Bold.ttf") format("truetype"); +} + +/* + * Copyright (c) 2010, Łukasz Dziedzic (dziedzic@typoland.com), + * with Reserved Font Name Lato. + * + * This Font Software is licensed under the SIL Open Font License, Version + * 1.1. 
+ * + * This license is copied below, and is also available with a FAQ at: + * http://scripts.sil.org/OFL + */ + +@font-face { + font-family: "Lato"; + font-style: normal; + font-weight: 300; + src: local("Lato Light"), + local("Lato-Light"), + url("fonts/Lato-Light.ttf") format("truetype"); +} + +@font-face { + font-family: "Lato"; + font-style: italic; + font-weight: 300; + src: local("Lato Light Italic"), + local("Lato-LightItalic"), + url("fonts/Lato-LightItalic.ttf") format("truetype"); +} + +@font-face { + font-family: "Lato"; + font-style: normal; + font-weight: 700; + src: local("Lato Regular"), + local("Lato-Regular"), + url("fonts/Lato-Regular.ttf") format("truetype"); +} + +@font-face { + font-family: "Lato"; + font-style: italic; + font-weight: 700; + src: local("Lato Italic"), + local("Lato-Italic"), + url("fonts/Lato-RegularItalic.ttf") format("truetype"); +} + +/* + * ----------------------------------------------------------- + * SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 + * ----------------------------------------------------------- + * + * PREAMBLE + * The goals of the Open Font License (OFL) are to stimulate worldwide + * development of collaborative font projects, to support the font creation + * efforts of academic and linguistic communities, and to provide a free and + * open framework in which fonts may be shared and improved in partnership + * with others. + * + * The OFL allows the licensed fonts to be used, studied, modified and + * redistributed freely as long as they are not sold by themselves. The + * fonts, including any derivative works, can be bundled, embedded, + * redistributed and/or sold with any software provided that any reserved + * names are not used by derivative works. The fonts and derivatives, + * however, cannot be released under any other type of license. The + * requirement for fonts to remain under this license does not apply + * to any document created using the fonts or their derivatives. 
+ * + * DEFINITIONS + * "Font Software" refers to the set of files released by the Copyright + * Holder(s) under this license and clearly marked as such. This may + * include source files, build scripts and documentation. + * + * "Reserved Font Name" refers to any names specified as such after the + * copyright statement(s). + * + * "Original Version" refers to the collection of Font Software components as + * distributed by the Copyright Holder(s). + * + * "Modified Version" refers to any derivative made by adding to, deleting, + * or substituting -- in part or in whole -- any of the components of the + * Original Version, by changing formats or by porting the Font Software to a + * new environment. + * + * "Author" refers to any designer, engineer, programmer, technical + * writer or other person who contributed to the Font Software. + * + * PERMISSION & CONDITIONS + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of the Font Software, to use, study, copy, merge, embed, modify, + * redistribute, and sell modified and unmodified copies of the Font + * Software, subject to the following conditions: + * + * 1) Neither the Font Software nor any of its individual components, + * in Original or Modified Versions, may be sold by itself. + * + * 2) Original or Modified Versions of the Font Software may be bundled, + * redistributed and/or sold with any software, provided that each copy + * contains the above copyright notice and this license. These can be + * included either as stand-alone text files, human-readable headers or + * in the appropriate machine-readable metadata fields within text or + * binary files as long as those fields can be easily viewed by the user. + * + * 3) No Modified Version of the Font Software may use the Reserved Font + * Name(s) unless explicit written permission is granted by the corresponding + * Copyright Holder. This restriction only applies to the primary font name as + * presented to the users. 
+ * + * 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font + * Software shall not be used to promote, endorse or advertise any + * Modified Version, except to acknowledge the contribution(s) of the + * Copyright Holder(s) and the Author(s) or with their explicit written + * permission. + * + * 5) The Font Software, modified or unmodified, in part or in whole, + * must be distributed entirely under this license, and must not be + * distributed under any other license. The requirement for fonts to + * remain under this license does not apply to any document created + * using the Font Software. + * + * TERMINATION + * This license becomes null and void if any of the above conditions are + * not met. + * + * DISCLAIMER + * THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL + * DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM + * OTHER DEALINGS IN THE FONT SOFTWARE. 
+ */ + diff --git a/docs/rdoc/fonts/Lato-Light.ttf b/docs/rdoc/fonts/Lato-Light.ttf new file mode 100644 index 0000000..b49dd43 Binary files /dev/null and b/docs/rdoc/fonts/Lato-Light.ttf differ diff --git a/docs/rdoc/fonts/Lato-LightItalic.ttf b/docs/rdoc/fonts/Lato-LightItalic.ttf new file mode 100644 index 0000000..7959fef Binary files /dev/null and b/docs/rdoc/fonts/Lato-LightItalic.ttf differ diff --git a/docs/rdoc/fonts/Lato-Regular.ttf b/docs/rdoc/fonts/Lato-Regular.ttf new file mode 100644 index 0000000..839cd58 Binary files /dev/null and b/docs/rdoc/fonts/Lato-Regular.ttf differ diff --git a/docs/rdoc/fonts/Lato-RegularItalic.ttf b/docs/rdoc/fonts/Lato-RegularItalic.ttf new file mode 100644 index 0000000..bababa0 Binary files /dev/null and b/docs/rdoc/fonts/Lato-RegularItalic.ttf differ diff --git a/docs/rdoc/fonts/SourceCodePro-Bold.ttf b/docs/rdoc/fonts/SourceCodePro-Bold.ttf new file mode 100644 index 0000000..61e3090 Binary files /dev/null and b/docs/rdoc/fonts/SourceCodePro-Bold.ttf differ diff --git a/docs/rdoc/fonts/SourceCodePro-Regular.ttf b/docs/rdoc/fonts/SourceCodePro-Regular.ttf new file mode 100644 index 0000000..85686d9 Binary files /dev/null and b/docs/rdoc/fonts/SourceCodePro-Regular.ttf differ diff --git a/docs/rdoc/images/add.png b/docs/rdoc/images/add.png new file mode 100644 index 0000000..6332fef Binary files /dev/null and b/docs/rdoc/images/add.png differ diff --git a/docs/rdoc/images/arrow_up.png b/docs/rdoc/images/arrow_up.png new file mode 100644 index 0000000..1ebb193 Binary files /dev/null and b/docs/rdoc/images/arrow_up.png differ diff --git a/docs/rdoc/images/brick.png b/docs/rdoc/images/brick.png new file mode 100644 index 0000000..7851cf3 Binary files /dev/null and b/docs/rdoc/images/brick.png differ diff --git a/docs/rdoc/images/brick_link.png b/docs/rdoc/images/brick_link.png new file mode 100644 index 0000000..9ebf013 Binary files /dev/null and b/docs/rdoc/images/brick_link.png differ diff --git a/docs/rdoc/images/bug.png 
b/docs/rdoc/images/bug.png new file mode 100644 index 0000000..2d5fb90 Binary files /dev/null and b/docs/rdoc/images/bug.png differ diff --git a/docs/rdoc/images/bullet_black.png b/docs/rdoc/images/bullet_black.png new file mode 100644 index 0000000..5761970 Binary files /dev/null and b/docs/rdoc/images/bullet_black.png differ diff --git a/docs/rdoc/images/bullet_toggle_minus.png b/docs/rdoc/images/bullet_toggle_minus.png new file mode 100644 index 0000000..b47ce55 Binary files /dev/null and b/docs/rdoc/images/bullet_toggle_minus.png differ diff --git a/docs/rdoc/images/bullet_toggle_plus.png b/docs/rdoc/images/bullet_toggle_plus.png new file mode 100644 index 0000000..9ab4a89 Binary files /dev/null and b/docs/rdoc/images/bullet_toggle_plus.png differ diff --git a/docs/rdoc/images/date.png b/docs/rdoc/images/date.png new file mode 100644 index 0000000..783c833 Binary files /dev/null and b/docs/rdoc/images/date.png differ diff --git a/docs/rdoc/images/delete.png b/docs/rdoc/images/delete.png new file mode 100644 index 0000000..08f2493 Binary files /dev/null and b/docs/rdoc/images/delete.png differ diff --git a/docs/rdoc/images/find.png b/docs/rdoc/images/find.png new file mode 100644 index 0000000..1547479 Binary files /dev/null and b/docs/rdoc/images/find.png differ diff --git a/docs/rdoc/images/loadingAnimation.gif b/docs/rdoc/images/loadingAnimation.gif new file mode 100644 index 0000000..82290f4 Binary files /dev/null and b/docs/rdoc/images/loadingAnimation.gif differ diff --git a/docs/rdoc/images/macFFBgHack.png b/docs/rdoc/images/macFFBgHack.png new file mode 100644 index 0000000..c6473b3 Binary files /dev/null and b/docs/rdoc/images/macFFBgHack.png differ diff --git a/docs/rdoc/images/package.png b/docs/rdoc/images/package.png new file mode 100644 index 0000000..da3c2a2 Binary files /dev/null and b/docs/rdoc/images/package.png differ diff --git a/docs/rdoc/images/page_green.png b/docs/rdoc/images/page_green.png new file mode 100644 index 0000000..de8e003 
Binary files /dev/null and b/docs/rdoc/images/page_green.png differ diff --git a/docs/rdoc/images/page_white_text.png b/docs/rdoc/images/page_white_text.png new file mode 100644 index 0000000..813f712 Binary files /dev/null and b/docs/rdoc/images/page_white_text.png differ diff --git a/docs/rdoc/images/page_white_width.png b/docs/rdoc/images/page_white_width.png new file mode 100644 index 0000000..1eb8809 Binary files /dev/null and b/docs/rdoc/images/page_white_width.png differ diff --git a/docs/rdoc/images/plugin.png b/docs/rdoc/images/plugin.png new file mode 100644 index 0000000..6187b15 Binary files /dev/null and b/docs/rdoc/images/plugin.png differ diff --git a/docs/rdoc/images/ruby.png b/docs/rdoc/images/ruby.png new file mode 100644 index 0000000..f763a16 Binary files /dev/null and b/docs/rdoc/images/ruby.png differ diff --git a/docs/rdoc/images/tag_blue.png b/docs/rdoc/images/tag_blue.png new file mode 100644 index 0000000..3f02b5f Binary files /dev/null and b/docs/rdoc/images/tag_blue.png differ diff --git a/docs/rdoc/images/tag_green.png b/docs/rdoc/images/tag_green.png new file mode 100644 index 0000000..83ec984 Binary files /dev/null and b/docs/rdoc/images/tag_green.png differ diff --git a/docs/rdoc/images/transparent.png b/docs/rdoc/images/transparent.png new file mode 100644 index 0000000..d665e17 Binary files /dev/null and b/docs/rdoc/images/transparent.png differ diff --git a/docs/rdoc/images/wrench.png b/docs/rdoc/images/wrench.png new file mode 100644 index 0000000..5c8213f Binary files /dev/null and b/docs/rdoc/images/wrench.png differ diff --git a/docs/rdoc/images/wrench_orange.png b/docs/rdoc/images/wrench_orange.png new file mode 100644 index 0000000..565a933 Binary files /dev/null and b/docs/rdoc/images/wrench_orange.png differ diff --git a/docs/rdoc/images/zoom.png b/docs/rdoc/images/zoom.png new file mode 100644 index 0000000..908612e Binary files /dev/null and b/docs/rdoc/images/zoom.png differ diff --git a/docs/rdoc/index.html 
b/docs/rdoc/index.html new file mode 100644 index 0000000..15687e8 --- /dev/null +++ b/docs/rdoc/index.html @@ -0,0 +1,149 @@ + + + + + + +Bayes::Bishop Documentation + + + + + + + + + + + + + + + + + +
+ + +

Introduction

+ +

This is a Naive Bayes classifier that can be used to categorize text based +on trained “pools”. Training counts how often each word is used, except for +any specified stop words. The Bishop::Bayes#guess method tokenizes the +message and then calculates for each pool the probability that the message +belongs to the same “classification” as that pool. For example, you could train the +system with one pool of “spam” email and one pool of “non-spam” email. Then +you could ask the guess method which pool each incoming message belongs to.

+ +

Usage

+
  1. +

    Create a Bishop::Bayes object:

    + +
    b = Bishop::Bayes.new
    +
  2. +

    Train with multiple pools of text:

    + +
    b.train('pool1', text1)  
    +b.train('pool2', text2)  
    +b.train('pool3', text3)
    +
  3. +

    Call the guess method with a message to categorize:

    + +
    guesses = b.guess('This is a sentence')
    +
+ +

The return value is a hash where the keys are pool names and each value is +the probability that the message belongs to that pool.

+ +

Features

+
  • +

    Stop words may be specified

    + +
    b.add_stop_words(an_array_words)  
    +b.add_stop_word('word')
    +
  • +

    You can include the default stop words list

    + +
    b.load_default_stop_words
    +
  • +

    You can choose between the default tokenizer, a stemming tokenizer, or a +custom tokenizer

    + +
    b = Bishop::Bayes.new
    +b = Bishop::Bayes.new(Bishop::StemmingTokenizer)
    +b = Bishop::Bayes.new(CustomTokenizer)
    +
+
+ + + + + diff --git a/docs/rdoc/js/darkfish.js b/docs/rdoc/js/darkfish.js new file mode 100644 index 0000000..06fef3b --- /dev/null +++ b/docs/rdoc/js/darkfish.js @@ -0,0 +1,140 @@ +/** + * + * Darkfish Page Functions + * $Id: darkfish.js 53 2009-01-07 02:52:03Z deveiant $ + * + * Author: Michael Granger + * + */ + +/* Provide console simulation for firebug-less environments */ +if (!("console" in window) || !("firebug" in console)) { + var names = ["log", "debug", "info", "warn", "error", "assert", "dir", "dirxml", + "group", "groupEnd", "time", "timeEnd", "count", "trace", "profile", "profileEnd"]; + + window.console = {}; + for (var i = 0; i < names.length; ++i) + window.console[names[i]] = function() {}; +}; + + +/** + * Unwrap the first element that matches the given @expr@ from the targets and return them. + */ +$.fn.unwrap = function( expr ) { + return this.each( function() { + $(this).parents( expr ).eq( 0 ).after( this ).remove(); + }); +}; + + +function showSource( e ) { + var target = e.target; + var codeSections = $(target). + parents('.method-detail'). + find('.method-source-code'); + + $(target). + parents('.method-detail'). + find('.method-source-code'). + slideToggle(); +}; + +function hookSourceViews() { + $('.method-heading').click( showSource ); +}; + +function toggleDebuggingSection() { + $('.debugging-section').slideToggle(); +}; + +function hookDebuggingToggle() { + $('#debugging-toggle img').click( toggleDebuggingSection ); +}; + +function hookSearch() { + var input = $('#search-field').eq(0); + var result = $('#search-results').eq(0); + $(result).show(); + + var search_section = $('#search-section').get(0); + $(search_section).show(); + + var search = new Search(search_data, input, result); + + search.renderItem = function(result) { + var li = document.createElement('li'); + var html = ''; + + // TODO add relative path to + + + + + + + + + + +
+

Table of Contents - Bayes::Bishop Documentation

+ +

Pages

+ + +

Classes and Modules

+ + +

Methods

+ +
+ + + + diff --git a/lib/bayes/bishop.rb b/lib/bayes/bishop.rb index 320b5ad..b6a7381 100755 --- a/lib/bayes/bishop.rb +++ b/lib/bayes/bishop.rb @@ -1,95 +1,203 @@ +# This is a Naive Bayes classifier that can be used to categorize text based on trained "pools". +# +# Copyright 2014, Maymount Enterprises, Ltd. # -# This module is a port to the Ruby language of the Reverend Bayesian classifier distributed -# as part of the Divmod project (which is Copyright 2003 Amir Bakhtiar +# It is a port to the Ruby language of the Divmod project (which is Copyright 2003 Amir Bakhtiar +# and based on the Ruby port, Copyright 2005 by Matt Mower # -# This Ruby port is Copyright 2005 Matt Mower and is free software; -# you can distribute it and/or modify it under the terms of version 2.1 of the GNU -# Lesser General Public License as published by the Free Software Foundation. +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. # +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser Public License for more details. +# +# You should have received a copy of the GNU Lesser Public License +# along with this program. If not, see . +# + require 'yaml' require 'stemmer' +require 'json' module Bishop - # - # As at v1.8 Ruby's YAML persists Hashes using special processing rather - # than by dumping it's instance variables, hence no instance variables - # in a Hash subclass get dumped either. 
- # - class BayesData - - attr_accessor :token_count, :train_count, :name - attr_reader :training, :data + class BayesPool #:nodoc: + include Enumerable + + # Sum of token counts in the pool + attr_reader :token_count + + # Number of times train has been called for this pool + attr_accessor :train_count + + # Hash that contains counts for all tokens + attr_reader :data - def initialize( name = '', pool = nil ) - @name = name - @training = [] - @pool = pool + def initialize @data = Hash.new( 0.0 ) - self.token_count = 0 - self.train_count = 0 + @token_count = 0 + @train_count = 0 end - def trained_on?( item ) - self.training.include? item + # Iterate through the tokens in the pool + def each + @data.each end def to_s - "" + "" + end + + # Convert the pool into an array of the format [['token1',count1],['token2',count2],...] + def to_a + @data.to_a end + # Return all of the tokens in the pool + def tokens + @data.keys.sort + end + + # Return the number of tokens in the pool + def num_tokens + @data.length + end + + # Add a token to the pool, incrementing its count value, and updating the token_value + def add_token token, count = 1 + if @data.has_key?(token) + @data[token] = @data[token] + count + else + @data[token] = count + end + + @token_count = @token_count + count + end + + # Set the count for a the specified token + def []= token, count + add_token(token,count) + end + + # Get the count for the specified token + def [] token + @data[token] + end + + # Merge another pool into the current pool + def merge(other_pool) + other_pool.data.each { |token,count| add_token(token,count) } + end + + # Decrement the token count and remove the token if the count is 0 + def remove_token token, count = 1 + @data[token] -= count + @data.delete(token) if @data[token] < 1 + @token_count = @token_count - count + end end # A tokenizer class which splits words removing non word characters except hyphens. 
class SimpleTokenizer - def tokenize( item, stop_words ) + def tokenize( item, stop_words=[] ) item.split( /\s+/ ).map do |i| - token = i.split( /\-/ ).map { |token| token.downcase.gsub( /\W/, "" ) }.join( "-" ) + i.split( /\-/ ).map { |token| token.downcase.gsub( /\W/, "" ) }.join( "-" ) end.reject { |t| t == "" || t == "-" || stop_words.detect { |w| w == t } } end end # A tokenizer which, having split words, reduces them to porter stemmed tokens class StemmingTokenizer < SimpleTokenizer - def tokenize( item, stop_words ) + def tokenize( item, stop_words=[] ) super( item, stop_words ).map { |word| word.stem } end end class Bayes - attr_accessor :dirty, :train_count, :pools, :tokenizer, :data_class, :corpus, :cache, :combiner - attr_reader :data_class, :stop_words + # instance of Tokenizer that handles tokenization + attr_accessor :tokenizer + + # Block called to combine probabilities. Set to Bishop.robinson by default + attr_accessor :combiner + + # An array containing stop words that the tokenizer will ignore + attr_reader :stop_words + + @dirty = true # set to true for any changes, false when pool_probs is called + @corpus_data = nil # __Corpus__ pool, created when corpus method called + @cache = nil # hash of BayesPool objects that contain probabilities instead of counts - def initialize( tokenizer = SimpleTokenizer, data_class = BayesData, &combiner ) + # tokenizer is the name of the class that will separate the input into tokens. + # See SimpleTokenizer and StemmingTokenizer for more information. 
+ # Combiner defaults to a block that calls Bishop.robinson + def initialize( tokenizer = SimpleTokenizer, &combiner ) @tokenizer = tokenizer.new @combiner = combiner || Proc.new { |probs,ignore| Bishop.robinson( probs, ignore ) } - @data_class = data_class - @pools = {} - @corpus = new_pool( '__Corpus__' ) - @pools['__Corpus__'] = @corpus - @train_count = 0 - @dirty = true + @pools = {} # hash, key = pool name, value = BayesPool class + @cache = {} # created by calling build_cache, contains raw probabilities + @corpus_data = nil # created when corpus method is called, contains token totals + @dirty = true # indicates that cache and corpus_data are invalid @stop_words = [] end + + + # + # == POOLS + # + + # Get the pool specified by name + def pool pool_name + @pools[pool_name] + end - def commit - self.save + # Get a list of pools + def pool_names + @pools.keys.sort + end + + # Create a new, empty, pool without training. + def new_pool( pool_name ) + @dirty = true + @pools[ pool_name ] ||= BayesPool.new end - def dirty? - self.dirty + # Remove the given pool + def remove_pool( pool_name ) + @dirty = true + @pools.delete( pool_name ) + end + + # Rename the given pool + def rename_pool( pool_name, new_name ) + @pools[new_name] = @pools[pool_name] + @pools.delete( pool_name ) + @dirty = true + end + + # Merge the contents of the source pool into the destination destination pool. + def merge_pools( dest_name, source_name ) + @pools[dest_name].merge(@pools[source_name]) + @dirty = true end - # Add each of the specified stop words + # + # == STOP WORDS + # + + # Add an array of stop words def add_stop_words( words ) - words.each { |word| add_stop_word word } + words.each { |word| add_stop_word word if !word.empty? } end # Add the specified stop word def add_stop_word( word ) - @stop_words << word unless @stop_words.include? 
word + @stop_words << word.downcase if !@stop_words.include?(word.downcase) end # Load stopwords from the specified YAML formatted file @@ -102,63 +210,52 @@ def load_default_stop_words load_stop_words( File.join( File.dirname( __FILE__ ), 'stopwords.yml' ) ) end - # Create a new, empty, pool without training. - def new_pool( pool_name ) - self.dirty = true - self.pools[ pool_name ] ||= @data_class.new( pool_name ) - end - - def remove_pool( pool_name ) - self.pools.delete( pool_name ) - end - - def rename_pool( pool_name, new_name ) - self.pools[new_name] = self.pools[pool_name] - self.pools[new_name].name = new_name - self.pools.delete( pool_name ) - self.dirty = true - end + # + # EXPORT & IMPORT STATE + # - # Merge the contents of the source pool into the destination - # destination pool. - def merge_pools( dest_name, source_name ) - dest_pool = self.pools[dest_name] - self.pools[source_name].data.each do |token,count| - if dest_pool.data.has_key?( token ) - dest_pool.data[token] += count - else - dest_pool.data[token] = count - dest_pool.token_count += 1 + # Get a hash that represents the current state, excluding tokenizer and combiner + def export + h = { + :stop_words => @stop_words.join(',') + } + pools = {} + @pools.each do |pool_name,pool| + data = pool.data + sorted_data = data.sort do |a,b| + if a[1] == b[1] + a[0] <=> b[0] + else + a[1] <=> b[1] + end end + pools[pool_name] = { + :token_count => pool.token_count, + :train_count => pool.train_count, + :data => sorted_data.to_h + } end - self.dirty = true + h[:pools] = pools + h end - - # Return an array of token counts for the specified pool. - def pool_data( pool_name ) - self.pools[pool_name].data.to_a - end - - # Return an array of tokens trained in the specified pool. - def pool_tokens( pool_name ) - self.pools[pool_name].data.keys + + # Gets the current state in YAML format + def to_yaml + export.to_yaml end - # Create a representation of the state of the classifier which can - # be reloaded later. 
This does not include the tokenizer, data class, - # or combiner functions which must be reinitialized each time the - # classifier is created. - def save( file = 'bayesdata.yml' ) - File.open( file, 'w' ) { |f| f << export } + # Gets the current state in JSON format + def to_json + JSON.pretty_generate(export) end - # Define the YAML representation of the state of the classifier (possibly this - # should just be an override of the to_yaml method generated by the YAML module). - def export - { :pools => self.pools, :train_count => self.train_count, :stop_words => self.stop_words }.to_yaml + # Save the current state to a YAML file, default = 'bayesdata.yml' + def save_yaml( file = 'bayesdata.yml' ) + File.open( file, 'w' ) { |f| f << to_yaml } end - - def load( file = 'bayesdata.yml' ) + + # Load the current state from a YAML file, default = 'bayesdata.yml' + def load_yaml( file = 'bayesdata.yml' ) begin File.open( file ) { |f| load_data( f ) } rescue Errno::ENOENT @@ -166,50 +263,132 @@ def load( file = 'bayesdata.yml' ) end end + # + # TRAIN & GUESS + # + + # Train the specified pool with the given input. + # If the input is a string it is passed through the configured Tokenizer. + # Otherwise, if it is an array it is just added. + def train( pool_name, input ) + tokens = input.is_a?(String) ? get_tokens( input ) : input + pool = new_pool( pool_name ) + train_( pool, tokens ) + pool.train_count += 1 + @dirty = true + end + + # Remove the input from the given pool + # If the input is a string it is passed through the configured Tokenizer. + # Otherwise, if it is an array it is just added. + def untrain( pool_name, input ) + pool = find_pool( pool_name ) + return if !pool + tokens = input.is_a?(String) ? get_tokens( input ) : input + untrain_( pool, tokens ) + pool.train_count -= 1 + @dirty = true + end + + # Returns true if the specified token has been trained for any pool + def trained_on?( token ) + build_cache if @dirty + @cache.values.any? { |v| v.trained_on? 
token } + end + + # Call this method to classify a "message". The return value will be + # a hash, with the pool name is the key and the probability is the value, for each pool which + # is a likely match for the message. + def guess( msg ) + tokens = get_tokens( msg ) + res = {} + + build_cache if dirty? + + @cache.each do |pool_name,pool| + p = get_probs( pool, tokens ) + if p.length != 0 + res[pool_name] = @combiner.call( p, pool_name ) + end + end + + h = Hash.new + res.sort.each { |a| h[a[0]] = a[1] } + h + end + + # + # Private Methods + # + def load_data( source ) data = YAML.load( source ) + data[:pools].each do |pool_name,pool_data| + pool = new_pool(pool_name) + pool.train_count = pool_data[:train_count] + pool_data[:data].each do |token,value| + pool.add_token(token,value) + end + end + + add_stop_words(data[:stop_words].split(',')) + + @dirty = true + end + + def dirty? # TODO Make private? + @dirty + end + + def train_( pool, tokens ) + tokens.each { |token| pool.add_token(token) } + end + + def untrain_( pool, tokens ) + tokens.each do |token| + pool.remove_token(token) + end + end + + def corpus + return @corpus_data if @corpus_data - @pools = data[:pools] - @pools.each { |pool_name,pool| pool.data.default = 0.0 } - @corpus = self.pools['__Corpus__'] + @corpus_data = BayesPool.new - @train_count = data[:train_count] - @stop_words = data[:stop_words] + @pools.each do |pool_name, pool| + @corpus_data.merge(pool) + end - self.dirty = true + @corpus_data end - def pool_names - self.pools.keys.sort.reject { |name| name == '__Corpus__' } - end - # Create a cache of the metrics for each pool. 
def build_cache - self.cache = {} + @cache = {} - self.pools.each do |name,pool| - unless name == '__Corpus__' - - pool_count = pool.token_count - them_count = [ 1, self.corpus.token_count - pool_count ].max - cache_dict = self.cache[ name ] ||= @data_class.new( name ) + return @cache if corpus.token_count == 0.0 + + @pools.each do |pool_name,pool| + if pool.token_count > 0 + cache_dict = @cache[ pool_name ] ||= BayesPool.new - self.corpus.data.each do |token,tot_count| - this_count = pool.data[token] + them_count = [ 1, corpus.token_count - pool.token_count ].max # tokens in other pools - unless this_count == 0.0 - other_count = tot_count - this_count + corpus.data.each do |token,corpus_count| + if pool.data.has_key?(token) - if pool_count > 0 - good_metric = [ 1.0, other_count / pool_count ].min - else - good_metric = 1.0 - end + # number of references in other pools + other_count = corpus_count - pool.data[token] + + # prob token is not in this pool + good_metric = [ 1.0, Float(other_count) / Float(pool.token_count) ].min - bad_metric = [ 1.0, this_count / them_count ].min + # prob token is in a different pool + # NOTE Must explicitly cast to Floats or else it does integration division, and the result is zero + bad_metric = [ 1.0, Float(pool.data[token]) / Float(them_count) ].min f = bad_metric / ( good_metric + bad_metric ) - + if ( f - 0.5 ).abs >= 0.1 cache_dict.data[token] = [ 0.0001, [ 0.9999, f ].min ].max end @@ -217,21 +396,13 @@ def build_cache end end end + @dirty = false + @cache end - - # Get the probabilities for each pool, recreating the cached information if - # any token information for any of the pools has changed. - def pool_probs - if self.dirty? - self.build_cache - self.dirty = false - end - self.cache - end # Create a token array from the specified input. 
def get_tokens( input ) - self.tokenizer.tokenize( input, self.stop_words ) + @tokenizer.tokenize( input, @stop_words ) end # For each word trained in the pool, collect it's occurrence data in the pool into a sorted array. @@ -239,87 +410,12 @@ def get_probs( pool, words ) words.find_all { |word| pool.data.has_key? word }.map { |word| [word,pool.data[word]] }.sort end - def train( pool_name, item, uid = nil ) - tokens = get_tokens( item ) - pool = new_pool( pool_name ) - train_( pool, tokens ) - self.corpus.train_count += 1 - pool.train_count += 1 - if uid - pool.training.push( uid ) - end - self.dirty = true - end - - def train_( pool, tokens ) - wc = 0 - tokens.each do |token| - pool.data[token] += 1 - self.corpus.data[token] += 1 - wc += 1 - end - pool.token_count += wc - self.corpus.token_count += wc - end - - def untrain( pool_name, item, uid = nil ) - tokens = get_tokens( item ) - pool = new_pool( pool_name ) - untrain_( pool, tokens ) - self.corpus.train_count += 1 - pool.train_count += 1 - if uid - pool.training.delete( uid ) - end - self.dirty = true - end - - def untrain_( pool, tokens ) - tokens.each do |token| - if pool.data.has_key? token - if pool.data[token] == 1 - pool.data.delete( token ) - else - pool.data[token] -= 1 - end - pool.token_count -= 1 - end - - if self.corpus.data.has_key? token - if self.corpus.data[token] == 1 - self.corpus.data.delete( token ) - else - self.corpus.data[token] -= 1 - end - self.corpus.token_count -= 1 - end - end - end - - def trained_on?( msg ) - self.cache.values.any? { |v| v.trained_on? msg } - end - - # Call this method to classify a "message". The return value will be - # an array containing tuples (pool, probability) for each pool which - # is a likely match for the message. 
- def guess( msg ) - tokens = get_tokens( msg ) - res = {} - - pool_probs.each do |pool_name,pool| - p = get_probs( pool, tokens ) - if p.length != 0 - res[pool_name] = self.combiner.call( p, pool_name ) - end - end - - res.sort - end + private :train_, :untrain_, :get_probs, :corpus, :build_cache, :dirty?, :get_tokens, :get_probs, :export, :load_data - private :train_, :untrain_ end + # default "combiner" set in initialize + # ignore is truly ignored def self.robinson( probs, ignore ) nth = 1.0/probs.length what_is_p = 1.0 - probs.map { |p| 1.0 - p[1] }.inject( 1.0 ) { |s,v| s * v } ** nth @@ -328,6 +424,7 @@ def self.robinson( probs, ignore ) ( 1 + what_is_s ) / 2 end + # Alternative combiner def self.robinson_fisher( probs, ignore ) n = probs.length @@ -346,7 +443,7 @@ def self.robinson_fisher( probs, ignore ) ( 1 + h - s ) / 2 end - def self.chi2p( chi, df ) + def self.chi2p( chi, df ) #:nodoc: m = chi / 2 sum = term = Math.exp( -m ) (1 .. df/2).each do |i| @@ -355,5 +452,4 @@ def self.chi2p( chi, df ) end [1.0, sum].min end - end \ No newline at end of file diff --git a/test/test_bayes.rb b/test/test_bayes.rb new file mode 100644 index 0000000..4d71efd --- /dev/null +++ b/test/test_bayes.rb @@ -0,0 +1,206 @@ +require 'rubygems' +require 'bundler' +require 'minitest' +require 'minitest/autorun' +Bundler.require(:default, :test) + +require_relative '../lib/bayes/bishop' + +class TestBayes < Minitest::Test + parallelize_me! + + LINCOLN1 = "Four score and seven years ago our fathers brought forth on this continent,"+ + " a new nation, conceived in Liberty, and dedicated to the proposition that all"+ + " men are created equal." + LINCOLN2 ="Now we are engaged in a great civil war, testing whether that nation, "+ + "or any nation so conceived and so dedicated, can long endure. We are met on"+ + " a great battle-field of that war. 
We have come to dedicate a portion of that"+ + " field, as a final resting place for those who here gave their lives that that"+ + " nation might live. It is altogether fitting and proper that we should do this." + LINCOLN3 = "But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow --"+ + " this ground. " + LINCOLN4 = "The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. " + + JABBER1 = "Beware the Jabberwock, my son!"+ + " The jaws that bite, the claws that catch!"+ + " Beware the Jubjub bird, and shun The frumious Bandersnatch!" + JABBER2 = "He took his vorpal sword in hand:"+ + " Long time the manxome foe he sought -- " + + " So rested he by the Tumtum tree, " + + " And stood awhile in thought. " + JABBER3 = "And, as in uffish thought he stood, " + + " The Jabberwock, with eyes of flame, " + + " Came whiffling through the tulgey wood, " + + " And burbled as it came!" + JABBER4 = "One, two! One, two! And through and through" + + " The vorpal blade went snicker-snack! " + + " He left it dead, and with its head " + + " He went galumphing back." 
+ + ROMEO = "Two households, both alike in dignity, "+ + "In fair Verona, where we lay our scene, "+ + "From ancient grudge break to new mutiny, " + + def test_bayes_initializer + b = Bishop::Bayes.new + assert_instance_of Bishop::SimpleTokenizer, b.tokenizer + refute_nil b.combiner + assert_equal 0, b.pool_names.length + assert_equal 0, b.stop_words.length + end + + def test_add_stop_words + b = Bishop::Bayes.new + sw = %w{ Alpha bEta gammA } + assert_equal 0, b.stop_words.length + b.add_stop_words( sw ) + assert_equal 3, b.stop_words.length + assert_equal sw.map {|s| s.downcase }, b.stop_words + end + + def test_duplicate_stop_words + b = Bishop::Bayes.new + sw = %w{ Alpha bEta gammA delta epsilon alpha betA omega } + sw2 = sw.map { |s| s.downcase }.uniq.sort + assert_equal 0, b.stop_words.length + b.add_stop_words( sw ) + assert_equal sw2.length, b.stop_words.length + assert_equal sw2, b.stop_words.sort + end + + def test_default_stop_words + b = Bishop::Bayes.new + assert_equal 0, b.stop_words.length + b.load_default_stop_words + refute_equal 0, b.stop_words.length + end + + def test_new_pool + b = Bishop::Bayes.new + p = b.new_pool('testing') + refute_nil b.pool('testing') + assert_equal b.pool('testing'),p + + b.remove_pool('testing') + assert_nil b.pool('testing') + end + + def test_rename_pool + b = Bishop::Bayes.new + p = b.new_pool('testing') + refute_nil b.pool('testing') + b.rename_pool('testing','gnitset') + assert_nil b.pool('testing') + refute_nil b.pool('gnitset') + end + + def test_pool_names + b = Bishop::Bayes.new + names1 = %w{ gamma alpha beta} + names1.each { |n| b.new_pool(n) } + assert_equal names1.sort, b.pool_names + end + + def test_train_simple + b = Bishop::Bayes.new + + b.load_default_stop_words + + b.train('lincoln', LINCOLN1) + b.train('lincoln', LINCOLN2) + b.train('lincoln', LINCOLN3) + + b.train('jabber', JABBER1) + b.train('jabber', JABBER2) + b.train('jabber', JABBER3) + + guess_lincoln = b.guess(LINCOLN4) + + assert_kind_of 
Hash, guess_lincoln + + guess_jabber = b.guess(JABBER4) + + guess_romeo = b.guess(ROMEO) + + assert guess_lincoln.has_key?('lincoln') + assert guess_jabber.has_key?('jabber') + refute guess_romeo.has_key?('lincoln') + refute guess_romeo.has_key?('jabber') + + assert guess_lincoln['lincoln'] > 0.9 + assert guess_jabber['jabber'] > 0.9 + end + + def test_train_array + b = Bishop::Bayes.new + t = Bishop::SimpleTokenizer.new + + b.train('a', t.tokenize(LINCOLN1)) + b.train('a', t.tokenize(LINCOLN2)) + b.train('a', t.tokenize(LINCOLN3)) + + b.train('b', LINCOLN1) + b.train('b', LINCOLN2) + b.train('b', LINCOLN3) + + assert_equal b.pool('a').data, b.pool('b').data + end + + def test_pool_merge + b = Bishop::Bayes.new + + b.load_default_stop_words + + b.train('lincoln', LINCOLN1) + b.train('lincoln', LINCOLN2) + b.train('lincoln', LINCOLN3) + + b.train('jabber', JABBER1) + b.train('jabber', JABBER2) + b.train('jabber', JABBER3) + + guess = b.guess(LINCOLN4) + + assert guess.has_key?('lincoln') + refute guess.has_key?('jabber') + + b.merge_pools('jabber','lincoln') + + guess = b.guess(LINCOLN4) + + assert guess.has_key?('jabber') + assert guess.has_key?('lincoln') + + + end + + def test_to_json + b = Bishop::Bayes.new + + + b.load_default_stop_words + + b.train('lincoln', LINCOLN1) + b.train('lincoln', LINCOLN2) + b.train('lincoln', LINCOLN3) + + b.train('jabber', JABBER1) + b.train('jabber', JABBER2) + b.train('jabber', JABBER3) + + b.train('romeo',ROMEO) + + j = JSON.parse(b.to_json) + + assert j.has_key?('stop_words') + assert j.has_key?('pools') + train_counts = { 'lincoln' => 3, 'jabber' => 3, 'romeo' => 1} + ['lincoln','jabber','romeo'].each do |p| + assert j['pools'].has_key?(p) + assert train_counts[p], j['pools'][p]['train_count'] + assert_equal j['pools'][p]['token_count'],j['pools'][p]['data'].inject(0) { |sum,n| sum + n[1] } + end + end + + +end \ No newline at end of file diff --git a/test/test_bayes_pool.rb b/test/test_bayes_pool.rb new file mode 100644 index 
0000000..0a0a17c --- /dev/null +++ b/test/test_bayes_pool.rb @@ -0,0 +1,67 @@ +require 'rubygems' +require 'bundler' +require 'minitest' +require 'minitest/autorun' +Bundler.require(:default, :test) + +require_relative '../lib/bayes/bishop' + +class TestBayesPool < Minitest::Test + parallelize_me! + + LINCOLN1 = "Four score and seven years ago our fathers brought forth on this continent,"+ + " a new nation, conceived in Liberty, and dedicated to the proposition that all"+ + " men are created equal." + + def test_creation_simple + bp = Bishop::BayesPool.new + refute_nil bp + refute_nil bp.data + assert_equal 0, bp.token_count + assert_equal 0, bp.train_count + assert_equal "", bp.to_s + end + + def test_merge + bp = Bishop::BayesPool.new + end + + def test_indexing + b = Bishop::Bayes.new + + b.train('lincoln', LINCOLN1) + + pool = b.pool('lincoln') + + pool.tokens.each do |token| + assert_equal pool.data[token], pool[token] + end + end + + def test_index_set + b = Bishop::Bayes.new + + pool = b.new_pool('simple') + + (1..10).each { |i| pool["token#{i}"] = i} + + (1..10).each do |i| + assert_equal i, pool["token#{i}"] + end + end + + def test_enumerable + b = Bishop::Bayes.new + + b.train('lincoln', LINCOLN1) + + pool = b.pool('lincoln') + + pool.each do |k,v| + assert_equal pool.data[k],v + assert_equal pool[k],v + end + end + + +end \ No newline at end of file diff --git a/test/test_tokenizers.rb b/test/test_tokenizers.rb new file mode 100644 index 0000000..f07a9fe --- /dev/null +++ b/test/test_tokenizers.rb @@ -0,0 +1,33 @@ +require 'rubygems' +require 'bundler' +require 'minitest' +require 'minitest/autorun' +Bundler.require(:default, :test) + +require_relative '../lib/bayes/bishop' + +class TestTokenizers < Minitest::Test + parallelize_me! 
+ + def test_simple_tokenizer + tokenizer = Bishop::SimpleTokenizer.new + s1 = ' this " :-) ;.; % $ &*# is a hyPhen-Test to see - what happens -- ' + r1 = ["this", "is", "a", "hyphen-test", "to", "see", "what", "happens"] + assert_equal r1, tokenizer.tokenize(s1) + end + + def test_simple_tokenizer_stop_words + tokenizer = Bishop::SimpleTokenizer.new + s1 = ' alpha beta delta gamma omega phi psi tau ' + r1 = %w( alpha beta gamma omega psi tau ) + assert_equal r1, tokenizer.tokenize(s1,%w(delta phi)) + end + + def test_stemming_tokenizer + tokenizer = Bishop::StemmingTokenizer.new + s1 = ' thankfulness liveliness socializer socialism ' + r1 = %w( thank liveli social social ) + tokens = tokenizer.tokenize(s1) + assert_equal r1, tokens + end +end \ No newline at end of file diff --git a/test/test_yaml.rb b/test/test_yaml.rb new file mode 100644 index 0000000..8333f84 --- /dev/null +++ b/test/test_yaml.rb @@ -0,0 +1,40 @@ +require 'rubygems' +require 'bundler' +require 'minitest' +require 'minitest/autorun' +Bundler.require(:default, :test) + +require_relative '../lib/bayes/bishop' + +class TestYaml < Minitest::Test + parallelize_me! + + LINCOLN1 = "Four score and seven years ago our fathers brought forth on this continent,"+ + " a new nation, conceived in Liberty, and dedicated to the proposition that all"+ + " men are created equal." + + + ROMEO = "Two households, both alike in dignity, "+ + "In fair Verona, where we lay our scene, "+ + "From ancient grudge break to new mutiny, " + + def test_yaml + b1 = Bishop::Bayes.new + b1.add_stop_words(ROMEO.split(/[^\w]+/)) + b1.train('lincoln',LINCOLN1) + b1.save_yaml + + b2 = Bishop::Bayes.new + b2.load_yaml + + assert_equal b1.stop_words, b2.stop_words + + b1_pool = b1.pool('lincoln') + b2_pool = b2.pool('lincoln') + assert_equal b1_pool.train_count, b2_pool.train_count + assert_equal b1_pool.token_count, b2_pool.token_count + assert_equal b1_pool.data, b2_pool.data + end + + +end \ No newline at end of file