diff --git a/ArunavD/nltk_test/.ipynb_checkpoints/nltk_run.py-checkpoint.ipynb b/ArunavD/nltk_test/.ipynb_checkpoints/nltk_run.py-checkpoint.ipynb new file mode 100644 index 0000000..6042edd --- /dev/null +++ b/ArunavD/nltk_test/.ipynb_checkpoints/nltk_run.py-checkpoint.ipynb @@ -0,0 +1,825 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Create dictionary from each words\n", + "\n", + "\n", + "from bs4 import BeautifulSoup\n", + "import urllib.request\n", + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Requesting the webpage\n", + "\n", + "\n", + "response = urllib.request.urlopen('http://php.net/')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Reading contents of the page\n", + "\n", + "\n", + "html = response.read()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PHP: Hypertext PreprocessorDownloadsDocumentationGet InvolvedHelpGetting StartedIntroductionA simple tutorialLanguage ReferenceBasic syntaxTypesVariablesConstantsExpressionsOperatorsControl StructuresFunctionsClasses and ObjectsNamespacesErrorsExceptionsGeneratorsReferences ExplainedPredefined VariablesPredefined ExceptionsPredefined Interfaces and ClassesContext options and parametersSupported Protocols and WrappersSecurityIntroductionGeneral considerationsInstalled as CGI binaryInstalled as an Apache moduleSession SecurityFilesystem SecurityDatabase SecurityError ReportingUsing Register GlobalsUser Submitted DataMagic QuotesHiding PHPKeeping CurrentFeaturesHTTP authentication with PHPCookiesSessionsDealing with XFormsHandling file uploadsUsing remote filesConnection handlingPersistent Database ConnectionsSafe ModeCommand line usageGarbage CollectionDTrace Dynamic TracingFunction ReferenceAffecting PHP's BehaviourAudio Formats ManipulationAuthentication ServicesCommand Line Specific ExtensionsCompression and Archive ExtensionsCredit Card ProcessingCryptography ExtensionsDatabase ExtensionsDate and Time Related ExtensionsFile System Related ExtensionsHuman Language and Character Encoding SupportImage Processing and GenerationMail Related ExtensionsMathematical ExtensionsNon-Text MIME OutputProcess Control ExtensionsOther Basic ExtensionsOther ServicesSearch Engine ExtensionsServer Specific ExtensionsSession ExtensionsText ProcessingVariable and Type Related ExtensionsWeb ServicesWindows Only ExtensionsXML ManipulationGUI ExtensionsKeyboard Shortcuts?This helpjNext menu itemkPrevious menu itemg pPrevious man pageg nNext man pageGScroll to bottomg gScroll to topg hGoto homepageg sGoto search(current page)/Focus search boxPHP is a popular general-purpose scripting language that is especially suited to web development.Fast, flexible and pragmatic, PHP powers everything from your blog to the most popular websites in the world.Download5.6.36·Release Notes·Upgrading7.0.30·Release Notes·Upgrading7.1.18·Release Notes·Upgrading7.2.7·Release Notes·Upgrading21 Jun 2018PHP 7.2.7 ReleasedPHP 7.2.7 Release AnnouncementThe PHP development team announces the immediate availability of PHP 7.2.7.\n", + " This is a primarily a bugfix release which includes a segfault fix for opcache.PHP 7.2 users are encouraged to upgrade to this version.For source downloads of PHP 7.2.7 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.21 Jun 2018PHP 7.3.0 alpha 2 ReleasedThe PHP team is glad to announce the release of the second PHP 7.3.0 version, PHP 7.3.0 Alpha 2. \n", + " The rough outline of the PHP 7.3 release cycle is specified in thePHP Wiki.For source downloads of PHP 7.3.0 Alpha 2 please visit thedownload page.\n", + " Windows sources and binaries can be found onwindows.php.net/qa/.Please carefully test this version and report any issues found in thebug reporting system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.The next release would be Alpha 3, planned for July 5.The signatures for the release can be found inthe manifestor onthe QA site.Thank you for helping us make PHP better.07 Jun 2018PHP 7.3.0 alpha 1 ReleasedPHP team is glad to announce the release of the first PHP 7.3.0 version, PHP 7.3.0 Alpha 1. \n", + "\t This starts the PHP 7.3 release cycle, the rough outline of which is specified in thePHP Wiki.For source downloads of PHP 7.3.0 Alpha 1 please visit thedownload page.Please carefully test this version and report any issues found in thebug reporting system.Please DO NOT use this version in production, it is an early test version.For more information on the new features and other changes, you can read theNEWSfile,\n", + "\t or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.The next release would be Alpha 2, planned for June 21.The signatures for the release can be found inthe manifestor onthe QA site.Thank you for helping us make PHP better.25 May 2018PHP 7.1.18 ReleasedThe PHP development team announces the immediate availability of PHP\n", + " 7.1.18.\n", + " \n", + " All PHP 7.1 users are encouraged to upgrade to this version.For source downloads of PHP 7.1.18 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.24 May 2018PHP 7.2.6 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.6.\n", + " This is a primarily a bugfix release which includes a memory corruption fix for EXIF.PHP 7.2 users are encouraged to upgrade to this version.For source downloads of PHP 7.2.6 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.26 Apr 2018PHP 5.6.36 ReleasedThe PHP development team announces the immediate availability of PHP\n", + " 5.6.36. This is a security release. Several security bugs have been fixed\n", + " in this release.\n", + "\n", + " All PHP 5.6 users are encouraged to upgrade to this version.For source downloads of PHP 5.6.36 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.26 Apr 2018PHP 7.1.17 ReleasedThe PHP development team announces the immediate availability of PHP\n", + " 7.1.17. This is a security fix release, containing many bugfixes.\n", + " \n", + " All PHP 7.1 users are encouraged to upgrade to this version.For source downloads of PHP 7.1.17 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.26 Apr 2018PHP 7.0.30 ReleasedThe PHP development team announces the immediate availability of PHP\n", + "\t7.0.30. This is a security release. Several security bugs have been fixed\n", + "\tin this release.\n", + " \n", + " All PHP 7.0 users are encouraged to upgrade to this version.For source downloads of PHP 7.0.30 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.01 Feb 2018PHP 7.2.2 ReleasedThe PHP development team announces the immediate availability of PHP\n", + " 7.2.2. This is a bugfix release, with several bug fixes included.All PHP 7.2 users are encouraged to upgrade to this version.For source downloads of PHP 7.2.2 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.12 Oct 2017PHP 7.2.0 Release Candidate 4 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 RC4.\n", + " This release is the fourth Release Candidate for 7.2.0.\n", + " All users of PHP are encouraged to test this version carefully, and report any bugs\n", + " and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Release Candidate 4 please visit thedownloadpage,\n", + " Windows sources and binaries can be found atwindows.php.net/qa/.The next Release Candidate will be announced on the 26th of October.\n", + " You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.28 Sep 2017PHP 7.2.0 Release Candidate 3 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 RC3.\n", + " This release is the third Release Candidate for 7.2.0.\n", + " All users of PHP are encouraged to test this version carefully, and report any bugs\n", + " and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Release Candidate 3 please visit thedownloadpage,\n", + " Windows sources and binaries can be found atwindows.php.net/qa/.The next Release Candidate will be announced on the 12th of October.\n", + " You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.31 Aug 2017PHP 7.2.0 Release Candidate 1 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 Release\n", + " Candidate 1. This release is the first Release Candidate for 7.2.0.\n", + " All users of PHP are encouraged to test this version carefully, and report any bugs\n", + " and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Release Candidate 1 please visit thedownloadpage,\n", + " Windows sources and binaries can be found atwindows.php.net/qa/.The second Release Candidate will be released on the 14th of September.\n", + " You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.17 Aug 2017PHP 7.2.0 Beta 3 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 Beta 3.\n", + " This release is the third and final beta for 7.2.0. All users of PHP are encouraged\n", + " to test this version carefully, and report any bugs and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Beta 3 please visit thedownloadpage,\n", + " Windows sources and binaries can be found atwindows.php.net/qa/.The first Release Candidate will be released on the 31th of August.\n", + " You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.06 Jul 2017PHP 7.2.0 Alpha 3 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 Alpha 3.\n", + " This release contains fixes and improvements relative to Alpha 2.\n", + " All users of PHP are encouraged to test this version carefully,\n", + " and report any bugs and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For information on new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile\n", + " for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Alpha 3 please visit thedownloadpage,\n", + " Windows sources and binaries can be found onwindows.php.net/qa/.The first beta will be released on the 20th of July. You can also read the full list of planned releases on ourwiki.Thank you for helping us make PHP better.Older News EntriesConferences calling for papersphp[world] 2018Upcoming conferencesLaravelConf Taiwan 2018Mid-Atlantic Developer ConferenceInternational PHP Conference 2018 - Fall EditionCoderCruise 2018 - The Bahamas!User Group EventsSpecial ThanksSocial media@official_phpCopyright © 2001-2018 The PHP GroupMy PHP.netContactOther PHP.net sitesMirror sitesPrivacy policy\n" + ] + } + ], + "source": [ + "# Parsing the HTML Content\n", + "\n", + "\n", + "soup = BeautifulSoup(html,\"html5lib\")\n", + "text = soup.get_text(strip=True)\n", + "print(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['PHP:', 'Hypertext', 'PreprocessorDownloadsDocumentationGet', 'InvolvedHelpGetting', 'StartedIntroductionA', 'simple', 'tutorialLanguage', 'ReferenceBasic', 'syntaxTypesVariablesConstantsExpressionsOperatorsControl', 'StructuresFunctionsClasses', 'and', 'ObjectsNamespacesErrorsExceptionsGeneratorsReferences', 'ExplainedPredefined', 'VariablesPredefined', 'ExceptionsPredefined', 'Interfaces', 'and', 'ClassesContext', 'options', 'and', 'parametersSupported', 'Protocols', 'and', 'WrappersSecurityIntroductionGeneral', 'considerationsInstalled', 'as', 'CGI', 'binaryInstalled', 'as', 'an', 'Apache', 'moduleSession', 'SecurityFilesystem', 'SecurityDatabase', 'SecurityError', 'ReportingUsing', 'Register', 'GlobalsUser', 'Submitted', 'DataMagic', 'QuotesHiding', 'PHPKeeping', 'CurrentFeaturesHTTP', 'authentication', 'with', 'PHPCookiesSessionsDealing', 'with', 'XFormsHandling', 'file', 'uploadsUsing', 'remote', 'filesConnection', 'handlingPersistent', 'Database', 'ConnectionsSafe', 'ModeCommand', 'line', 'usageGarbage', 'CollectionDTrace', 'Dynamic', 'TracingFunction', 'ReferenceAffecting', \"PHP's\", 'BehaviourAudio', 'Formats', 'ManipulationAuthentication', 'ServicesCommand', 'Line', 'Specific', 'ExtensionsCompression', 'and', 'Archive', 'ExtensionsCredit', 'Card', 'ProcessingCryptography', 'ExtensionsDatabase', 'ExtensionsDate', 'and', 'Time', 'Related', 'ExtensionsFile', 'System', 'Related', 'ExtensionsHuman', 'Language', 'and', 'Character', 'Encoding', 'SupportImage', 'Processing', 'and', 'GenerationMail', 'Related', 'ExtensionsMathematical', 'ExtensionsNon-Text', 'MIME', 'OutputProcess', 'Control', 'ExtensionsOther', 'Basic', 'ExtensionsOther', 'ServicesSearch', 'Engine', 'ExtensionsServer', 'Specific', 'ExtensionsSession', 'ExtensionsText', 'ProcessingVariable', 'and', 'Type', 'Related', 'ExtensionsWeb', 'ServicesWindows', 'Only', 'ExtensionsXML', 'ManipulationGUI', 'ExtensionsKeyboard', 'Shortcuts?This', 'helpjNext', 'menu', 'itemkPrevious', 'menu', 'itemg', 'pPrevious', 'man', 'pageg', 'nNext', 'man', 'pageGScroll', 'to', 'bottomg', 'gScroll', 'to', 'topg', 'hGoto', 'homepageg', 'sGoto', 'search(current', 'page)/Focus', 'search', 'boxPHP', 'is', 'a', 'popular', 'general-purpose', 'scripting', 'language', 'that', 'is', 'especially', 'suited', 'to', 'web', 'development.Fast,', 'flexible', 'and', 'pragmatic,', 'PHP', 'powers', 'everything', 'from', 'your', 'blog', 'to', 'the', 'most', 'popular', 'websites', 'in', 'the', 'world.Download5.6.36·Release', 'Notes·Upgrading7.0.30·Release', 'Notes·Upgrading7.1.18·Release', 'Notes·Upgrading7.2.7·Release', 'Notes·Upgrading21', 'Jun', '2018PHP', '7.2.7', 'ReleasedPHP', '7.2.7', 'Release', 'AnnouncementThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.7.', 'This', 'is', 'a', 'primarily', 'a', 'bugfix', 'release', 'which', 'includes', 'a', 'segfault', 'fix', 'for', 'opcache.PHP', '7.2', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.2.7', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.21', 'Jun', '2018PHP', '7.3.0', 'alpha', '2', 'ReleasedThe', 'PHP', 'team', 'is', 'glad', 'to', 'announce', 'the', 'release', 'of', 'the', 'second', 'PHP', '7.3.0', 'version,', 'PHP', '7.3.0', 'Alpha', '2.', 'The', 'rough', 'outline', 'of', 'the', 'PHP', '7.3', 'release', 'cycle', 'is', 'specified', 'in', 'thePHP', 'Wiki.For', 'source', 'downloads', 'of', 'PHP', '7.3.0', 'Alpha', '2', 'please', 'visit', 'thedownload', 'page.', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/qa/.Please', 'carefully', 'test', 'this', 'version', 'and', 'report', 'any', 'issues', 'found', 'in', 'thebug', 'reporting', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.The', 'next', 'release', 'would', 'be', 'Alpha', '3,', 'planned', 'for', 'July', '5.The', 'signatures', 'for', 'the', 'release', 'can', 'be', 'found', 'inthe', 'manifestor', 'onthe', 'QA', 'site.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.07', 'Jun', '2018PHP', '7.3.0', 'alpha', '1', 'ReleasedPHP', 'team', 'is', 'glad', 'to', 'announce', 'the', 'release', 'of', 'the', 'first', 'PHP', '7.3.0', 'version,', 'PHP', '7.3.0', 'Alpha', '1.', 'This', 'starts', 'the', 'PHP', '7.3', 'release', 'cycle,', 'the', 'rough', 'outline', 'of', 'which', 'is', 'specified', 'in', 'thePHP', 'Wiki.For', 'source', 'downloads', 'of', 'PHP', '7.3.0', 'Alpha', '1', 'please', 'visit', 'thedownload', 'page.Please', 'carefully', 'test', 'this', 'version', 'and', 'report', 'any', 'issues', 'found', 'in', 'thebug', 'reporting', 'system.Please', 'DO', 'NOT', 'use', 'this', 'version', 'in', 'production,', 'it', 'is', 'an', 'early', 'test', 'version.For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.The', 'next', 'release', 'would', 'be', 'Alpha', '2,', 'planned', 'for', 'June', '21.The', 'signatures', 'for', 'the', 'release', 'can', 'be', 'found', 'inthe', 'manifestor', 'onthe', 'QA', 'site.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.25', 'May', '2018PHP', '7.1.18', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.1.18.', 'All', 'PHP', '7.1', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.1.18', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.24', 'May', '2018PHP', '7.2.6', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.6.', 'This', 'is', 'a', 'primarily', 'a', 'bugfix', 'release', 'which', 'includes', 'a', 'memory', 'corruption', 'fix', 'for', 'EXIF.PHP', '7.2', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.2.6', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.26', 'Apr', '2018PHP', '5.6.36', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '5.6.36.', 'This', 'is', 'a', 'security', 'release.', 'Several', 'security', 'bugs', 'have', 'been', 'fixed', 'in', 'this', 'release.', 'All', 'PHP', '5.6', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '5.6.36', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.26', 'Apr', '2018PHP', '7.1.17', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.1.17.', 'This', 'is', 'a', 'security', 'fix', 'release,', 'containing', 'many', 'bugfixes.', 'All', 'PHP', '7.1', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.1.17', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.26', 'Apr', '2018PHP', '7.0.30', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.0.30.', 'This', 'is', 'a', 'security', 'release.', 'Several', 'security', 'bugs', 'have', 'been', 'fixed', 'in', 'this', 'release.', 'All', 'PHP', '7.0', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.0.30', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.01', 'Feb', '2018PHP', '7.2.2', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.2.', 'This', 'is', 'a', 'bugfix', 'release,', 'with', 'several', 'bug', 'fixes', 'included.All', 'PHP', '7.2', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.2.2', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.12', 'Oct', '2017PHP', '7.2.0', 'Release', 'Candidate', '4', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'RC4.', 'This', 'release', 'is', 'the', 'fourth', 'Release', 'Candidate', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '4', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'will', 'be', 'announced', 'on', 'the', '26th', 'of', 'October.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.28', 'Sep', '2017PHP', '7.2.0', 'Release', 'Candidate', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'RC3.', 'This', 'release', 'is', 'the', 'third', 'Release', 'Candidate', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'will', 'be', 'announced', 'on', 'the', '12th', 'of', 'October.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.31', 'Aug', '2017PHP', '7.2.0', 'Release', 'Candidate', '1', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '1.', 'This', 'release', 'is', 'the', 'first', 'Release', 'Candidate', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '1', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'second', 'Release', 'Candidate', 'will', 'be', 'released', 'on', 'the', '14th', 'of', 'September.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.17', 'Aug', '2017PHP', '7.2.0', 'Beta', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'Beta', '3.', 'This', 'release', 'is', 'the', 'third', 'and', 'final', 'beta', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Beta', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'first', 'Release', 'Candidate', 'will', 'be', 'released', 'on', 'the', '31th', 'of', 'August.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.06', 'Jul', '2017PHP', '7.2.0', 'Alpha', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'Alpha', '3.', 'This', 'release', 'contains', 'fixes', 'and', 'improvements', 'relative', 'to', 'Alpha', '2.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'on', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Alpha', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/qa/.The', 'first', 'beta', 'will', 'be', 'released', 'on', 'the', '20th', 'of', 'July.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'on', 'ourwiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.Older', 'News', 'EntriesConferences', 'calling', 'for', 'papersphp[world]', '2018Upcoming', 'conferencesLaravelConf', 'Taiwan', '2018Mid-Atlantic', 'Developer', 'ConferenceInternational', 'PHP', 'Conference', '2018', '-', 'Fall', 'EditionCoderCruise', '2018', '-', 'The', 'Bahamas!User', 'Group', 'EventsSpecial', 'ThanksSocial', 'media@official_phpCopyright', '©', '2001-2018', 'The', 'PHP', 'GroupMy', 'PHP.netContactOther', 'PHP.net', 'sitesMirror', 'sitesPrivacy', 'policy']\n" + ] + } + ], + "source": [ + "# the main work starts....\n", + "\n", + "\n", + "tokens = [t for t in text.split()]\n", + "print(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PHP::1\n", + "Hypertext:1\n", + "PreprocessorDownloadsDocumentationGet:1\n", + "InvolvedHelpGetting:1\n", + "StartedIntroductionA:1\n", + "simple:1\n", + "tutorialLanguage:1\n", + "ReferenceBasic:1\n", + "syntaxTypesVariablesConstantsExpressionsOperatorsControl:1\n", + "StructuresFunctionsClasses:1\n", + "and:44\n", + "ObjectsNamespacesErrorsExceptionsGeneratorsReferences:1\n", + "ExplainedPredefined:1\n", + "VariablesPredefined:1\n", + "ExceptionsPredefined:1\n", + "Interfaces:1\n", + "ClassesContext:1\n", + "options:1\n", + "parametersSupported:1\n", + "Protocols:1\n", + "WrappersSecurityIntroductionGeneral:1\n", + "considerationsInstalled:1\n", + "as:2\n", + "CGI:1\n", + "binaryInstalled:1\n", + "an:2\n", + "Apache:1\n", + "moduleSession:1\n", + "SecurityFilesystem:1\n", + "SecurityDatabase:1\n", + "SecurityError:1\n", + "ReportingUsing:1\n", + "Register:1\n", + "GlobalsUser:1\n", + "Submitted:1\n", + "DataMagic:1\n", + "QuotesHiding:1\n", + "PHPKeeping:1\n", + "CurrentFeaturesHTTP:1\n", + "authentication:1\n", + "with:3\n", + "PHPCookiesSessionsDealing:1\n", + "XFormsHandling:1\n", + "file:1\n", + "uploadsUsing:1\n", + "remote:1\n", + "filesConnection:1\n", + "handlingPersistent:1\n", + "Database:1\n", + "ConnectionsSafe:1\n", + "ModeCommand:1\n", + "line:1\n", + "usageGarbage:1\n", + "CollectionDTrace:1\n", + "Dynamic:1\n", + "TracingFunction:1\n", + "ReferenceAffecting:1\n", + "PHP's:1\n", + "BehaviourAudio:1\n", + "Formats:1\n", + "ManipulationAuthentication:1\n", + "ServicesCommand:1\n", + "Line:1\n", + "Specific:2\n", + "ExtensionsCompression:1\n", + "Archive:1\n", + "ExtensionsCredit:1\n", + "Card:1\n", + "ProcessingCryptography:1\n", + "ExtensionsDatabase:1\n", + "ExtensionsDate:1\n", + "Time:1\n", + "Related:4\n", + "ExtensionsFile:1\n", + "System:1\n", + "ExtensionsHuman:1\n", + "Language:1\n", + "Character:1\n", + "Encoding:1\n", + "SupportImage:1\n", + "Processing:1\n", + "GenerationMail:1\n", + "ExtensionsMathematical:1\n", + "ExtensionsNon-Text:1\n", + "MIME:1\n", + "OutputProcess:1\n", + "Control:1\n", + "ExtensionsOther:2\n", + "Basic:1\n", + "ServicesSearch:1\n", + "Engine:1\n", + "ExtensionsServer:1\n", + "ExtensionsSession:1\n", + "ExtensionsText:1\n", + "ProcessingVariable:1\n", + "Type:1\n", + "ExtensionsWeb:1\n", + "ServicesWindows:1\n", + "Only:1\n", + "ExtensionsXML:1\n", + "ManipulationGUI:1\n", + "ExtensionsKeyboard:1\n", + "Shortcuts?This:1\n", + "helpjNext:1\n", + "menu:2\n", + "itemkPrevious:1\n", + "itemg:1\n", + "pPrevious:1\n", + "man:2\n", + "pageg:1\n", + "nNext:1\n", + "pageGScroll:1\n", + "to:26\n", + "bottomg:1\n", + "gScroll:1\n", + "topg:1\n", + "hGoto:1\n", + "homepageg:1\n", + "sGoto:1\n", + "search(current:1\n", + "page)/Focus:1\n", + "search:1\n", + "boxPHP:1\n", + "is:24\n", + "a:18\n", + "popular:2\n", + "general-purpose:1\n", + "scripting:1\n", + "language:1\n", + "that:1\n", + "especially:1\n", + "suited:1\n", + "web:1\n", + "development.Fast,:1\n", + "flexible:1\n", + "pragmatic,:1\n", + "PHP:65\n", + "powers:1\n", + "everything:1\n", + "from:1\n", + "your:1\n", + "blog:1\n", + "the:50\n", + "most:1\n", + "websites:1\n", + "in:27\n", + "world.Download5.6.36·Release:1\n", + "Notes·Upgrading7.0.30·Release:1\n", + "Notes·Upgrading7.1.18·Release:1\n", + "Notes·Upgrading7.2.7·Release:1\n", + "Notes·Upgrading21:1\n", + "Jun:3\n", + "2018PHP:9\n", + "7.2.7:3\n", + "ReleasedPHP:2\n", + "Release:15\n", + "AnnouncementThe:1\n", + "development:12\n", + "team:14\n", + "announces:12\n", + "immediate:12\n", + "availability:12\n", + "of:59\n", + "7.2.7.:1\n", + "This:12\n", + "primarily:2\n", + "bugfix:3\n", + "release:22\n", + "which:3\n", + "includes:2\n", + "segfault:1\n", + "fix:3\n", + "for:25\n", + "opcache.PHP:1\n", + "7.2:3\n", + "users:12\n", + "are:12\n", + "encouraged:12\n", + "upgrade:7\n", + "this:17\n", + "version.For:8\n", + "source:21\n", + "downloads:14\n", + "please:14\n", + "visit:14\n", + "ourdownloads:7\n", + "page,:7\n", + "Windows:13\n", + "binaries:13\n", + "can:34\n", + "be:29\n", + "found:24\n", + "onwindows.php.net/download/.:7\n", + "The:10\n", + "list:19\n", + "changes:7\n", + "recorded:7\n", + "theChangeLog.21:1\n", + "7.3.0:8\n", + "alpha:2\n", + "2:2\n", + "ReleasedThe:12\n", + "glad:2\n", + "announce:2\n", + "second:2\n", + "version,:2\n", + "Alpha:10\n", + "2.:2\n", + "rough:2\n", + "outline:2\n", + "7.3:2\n", + "cycle:1\n", + "specified:2\n", + "thePHP:2\n", + "Wiki.For:2\n", + "thedownload:2\n", + "page.:1\n", + "sources:6\n", + "onwindows.php.net/qa/.Please:1\n", + "carefully:2\n", + "test:8\n", + "version:8\n", + "report:7\n", + "any:7\n", + "issues:2\n", + "thebug:7\n", + "reporting:2\n", + "system.THIS:6\n", + "IS:6\n", + "A:6\n", + "DEVELOPMENT:6\n", + "PREVIEW:6\n", + "-:8\n", + "DO:7\n", + "NOT:7\n", + "USE:6\n", + "IT:6\n", + "IN:6\n", + "PRODUCTION!For:6\n", + "more:6\n", + "information:7\n", + "on:13\n", + "new:7\n", + "features:7\n", + "other:7\n", + "changes,:7\n", + "you:14\n", + "read:12\n", + "theNEWSfile,:7\n", + "or:7\n", + "theUPGRADINGfile:7\n", + "complete:7\n", + "upgrading:7\n", + "notes.:7\n", + "These:7\n", + "files:7\n", + "also:12\n", + "archive.The:2\n", + "next:4\n", + "would:2\n", + "3,:1\n", + "planned:7\n", + "July:1\n", + "5.The:1\n", + "signatures:2\n", + "inthe:2\n", + "manifestor:2\n", + "onthe:2\n", + "QA:2\n", + "site.Thank:2\n", + "helping:7\n", + "us:7\n", + "make:7\n", + "better.07:1\n", + "1:4\n", + "first:4\n", + "1.:2\n", + "starts:1\n", + "cycle,:1\n", + "page.Please:1\n", + "system.Please:1\n", + "use:1\n", + "production,:1\n", + "it:1\n", + "early:1\n", + "2,:1\n", + "June:1\n", + "21.The:1\n", + "better.25:1\n", + "May:2\n", + "7.1.18:2\n", + "7.1.18.:1\n", + "All:9\n", + "7.1:2\n", + "theChangeLog.24:1\n", + "7.2.6:2\n", + "7.2.6.:1\n", + "memory:1\n", + "corruption:1\n", + "EXIF.PHP:1\n", + "theChangeLog.26:3\n", + "Apr:3\n", + "5.6.36:2\n", + "5.6.36.:1\n", + "security:5\n", + "release.:4\n", + "Several:2\n", + "bugs:7\n", + "have:2\n", + "been:2\n", + "fixed:2\n", + "5.6:1\n", + "7.1.17:2\n", + "7.1.17.:1\n", + "release,:2\n", + "containing:1\n", + "many:1\n", + "bugfixes.:1\n", + "7.0.30:2\n", + "7.0.30.:1\n", + "7.0:1\n", + "theChangeLog.01:1\n", + "Feb:1\n", + "7.2.2:2\n", + "7.2.2.:1\n", + "several:1\n", + "bug:1\n", + "fixes:2\n", + "included.All:1\n", + "theChangeLog.12:1\n", + "Oct:1\n", + "2017PHP:5\n", + "7.2.0:15\n", + "Candidate:14\n", + "4:2\n", + "RC4.:1\n", + "fourth:1\n", + "7.2.0.:4\n", + "carefully,:5\n", + "incompatibilities:5\n", + "tracking:5\n", + "archive.For:5\n", + "thedownloadpage,:5\n", + "atwindows.php.net/qa/.The:4\n", + "will:5\n", + "announced:2\n", + "26th:1\n", + "October.:2\n", + "You:5\n", + "full:5\n", + "releases:5\n", + "onour:4\n", + "wiki.Thank:4\n", + "better.28:1\n", + "Sep:1\n", + "3:6\n", + "RC3.:1\n", + "third:2\n", + "12th:1\n", + "better.31:1\n", + "Aug:2\n", + "released:3\n", + "14th:1\n", + "September.:1\n", + "better.17:1\n", + "Beta:3\n", + "3.:2\n", + "final:1\n", + "beta:2\n", + "31th:1\n", + "August.:1\n", + "better.06:1\n", + "Jul:1\n", + "contains:1\n", + "improvements:1\n", + "relative:1\n", + "onwindows.php.net/qa/.The:1\n", + "20th:1\n", + "July.:1\n", + "ourwiki.Thank:1\n", + "better.Older:1\n", + "News:1\n", + "EntriesConferences:1\n", + "calling:1\n", + "papersphp[world]:1\n", + "2018Upcoming:1\n", + "conferencesLaravelConf:1\n", + "Taiwan:1\n", + "2018Mid-Atlantic:1\n", + "Developer:1\n", + "ConferenceInternational:1\n", + "Conference:1\n", + "2018:2\n", + "Fall:1\n", + "EditionCoderCruise:1\n", + "Bahamas!User:1\n", + "Group:1\n", + "EventsSpecial:1\n", + "ThanksSocial:1\n", + "media@official_phpCopyright:1\n", + "©:1\n", + "2001-2018:1\n", + "GroupMy:1\n", + "PHP.netContactOther:1\n", + "PHP.net:1\n", + "sitesMirror:1\n", + "sitesPrivacy:1\n", + "policy:1\n" + ] + } + ], + "source": [ + "# NLTK - Natural Language Tool Kit\n", + "\n", + "\n", + "freq = nltk.FreqDist(tokens)\n", + "\n", + "\n", + "# Show the frequency dictionary\n", + "\n", + "\n", + "for key,val in freq.items():\n", + " print (str(key) + ':' + str(val))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stopword list\n", + "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n" + ] + } + ], + "source": [ + "# Remove stopwords - Stopword removal technique\n", + "\n", + "from nltk.corpus import stopwords\n", + "\n", + "\n", + "\n", + "# Copying an array completely\n", + "\n", + "clean_tokens = tokens[:]\n", + "\n", + "\n", + "\n", + "# Using english stopwords\n", + "\n", + "sr = stopwords.words('english')\n", + "print(\"Stopword list\")\n", + "print(sr)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['PHP:', 'Hypertext', 'PreprocessorDownloadsDocumentationGet', 'InvolvedHelpGetting', 'StartedIntroductionA', 'simple', 'tutorialLanguage', 'ReferenceBasic', 'syntaxTypesVariablesConstantsExpressionsOperatorsControl', 'StructuresFunctionsClasses', 'ObjectsNamespacesErrorsExceptionsGeneratorsReferences', 'ExplainedPredefined', 'VariablesPredefined', 'ExceptionsPredefined', 'Interfaces', 'ClassesContext', 'options', 'parametersSupported', 'Protocols', 'WrappersSecurityIntroductionGeneral', 'considerationsInstalled', 'CGI', 'binaryInstalled', 'Apache', 'moduleSession', 'SecurityFilesystem', 'SecurityDatabase', 'SecurityError', 'ReportingUsing', 'Register', 'GlobalsUser', 'Submitted', 'DataMagic', 'QuotesHiding', 'PHPKeeping', 'CurrentFeaturesHTTP', 'authentication', 'PHPCookiesSessionsDealing', 'XFormsHandling', 'file', 'uploadsUsing', 'remote', 'filesConnection', 'handlingPersistent', 'Database', 'ConnectionsSafe', 'ModeCommand', 'line', 'usageGarbage', 'CollectionDTrace', 'Dynamic', 'TracingFunction', 'ReferenceAffecting', \"PHP's\", 'BehaviourAudio', 'Formats', 'ManipulationAuthentication', 'ServicesCommand', 'Line', 'Specific', 'ExtensionsCompression', 'Archive', 'ExtensionsCredit', 'Card', 'ProcessingCryptography', 'ExtensionsDatabase', 'ExtensionsDate', 'Time', 'Related', 'ExtensionsFile', 'System', 'Related', 'ExtensionsHuman', 'Language', 'Character', 'Encoding', 'SupportImage', 'Processing', 'GenerationMail', 'Related', 'ExtensionsMathematical', 'ExtensionsNon-Text', 'MIME', 'OutputProcess', 'Control', 'ExtensionsOther', 'Basic', 'ExtensionsOther', 'ServicesSearch', 'Engine', 'ExtensionsServer', 'Specific', 'ExtensionsSession', 'ExtensionsText', 'ProcessingVariable', 'Type', 'Related', 'ExtensionsWeb', 'ServicesWindows', 'Only', 'ExtensionsXML', 'ManipulationGUI', 'ExtensionsKeyboard', 'Shortcuts?This', 'helpjNext', 'menu', 'itemkPrevious', 'menu', 'itemg', 'pPrevious', 'man', 'pageg', 'nNext', 'man', 'pageGScroll', 'bottomg', 'gScroll', 'topg', 'hGoto', 'homepageg', 'sGoto', 'search(current', 'page)/Focus', 'search', 'boxPHP', 'popular', 'general-purpose', 'scripting', 'language', 'especially', 'suited', 'web', 'development.Fast,', 'flexible', 'pragmatic,', 'PHP', 'powers', 'everything', 'blog', 'popular', 'websites', 'world.Download5.6.36·Release', 'Notes·Upgrading7.0.30·Release', 'Notes·Upgrading7.1.18·Release', 'Notes·Upgrading7.2.7·Release', 'Notes·Upgrading21', 'Jun', '2018PHP', '7.2.7', 'ReleasedPHP', '7.2.7', 'Release', 'AnnouncementThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.7.', 'This', 'primarily', 'bugfix', 'release', 'includes', 'segfault', 'fix', 'opcache.PHP', '7.2', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.2.7', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.21', 'Jun', '2018PHP', '7.3.0', 'alpha', '2', 'ReleasedThe', 'PHP', 'team', 'glad', 'announce', 'release', 'second', 'PHP', '7.3.0', 'version,', 'PHP', '7.3.0', 'Alpha', '2.', 'The', 'rough', 'outline', 'PHP', '7.3', 'release', 'cycle', 'specified', 'thePHP', 'Wiki.For', 'source', 'downloads', 'PHP', '7.3.0', 'Alpha', '2', 'please', 'visit', 'thedownload', 'page.', 'Windows', 'sources', 'binaries', 'found', 'onwindows.php.net/qa/.Please', 'carefully', 'test', 'version', 'report', 'issues', 'found', 'thebug', 'reporting', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.The', 'next', 'release', 'would', 'Alpha', '3,', 'planned', 'July', '5.The', 'signatures', 'release', 'found', 'inthe', 'manifestor', 'onthe', 'QA', 'site.Thank', 'helping', 'us', 'make', 'PHP', 'better.07', 'Jun', '2018PHP', '7.3.0', 'alpha', '1', 'ReleasedPHP', 'team', 'glad', 'announce', 'release', 'first', 'PHP', '7.3.0', 'version,', 'PHP', '7.3.0', 'Alpha', '1.', 'This', 'starts', 'PHP', '7.3', 'release', 'cycle,', 'rough', 'outline', 'specified', 'thePHP', 'Wiki.For', 'source', 'downloads', 'PHP', '7.3.0', 'Alpha', '1', 'please', 'visit', 'thedownload', 'page.Please', 'carefully', 'test', 'version', 'report', 'issues', 'found', 'thebug', 'reporting', 'system.Please', 'DO', 'NOT', 'use', 'version', 'production,', 'early', 'test', 'version.For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.The', 'next', 'release', 'would', 'Alpha', '2,', 'planned', 'June', '21.The', 'signatures', 'release', 'found', 'inthe', 'manifestor', 'onthe', 'QA', 'site.Thank', 'helping', 'us', 'make', 'PHP', 'better.25', 'May', '2018PHP', '7.1.18', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.1.18.', 'All', 'PHP', '7.1', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.1.18', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.24', 'May', '2018PHP', '7.2.6', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.6.', 'This', 'primarily', 'bugfix', 'release', 'includes', 'memory', 'corruption', 'fix', 'EXIF.PHP', '7.2', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.2.6', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.26', 'Apr', '2018PHP', '5.6.36', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '5.6.36.', 'This', 'security', 'release.', 'Several', 'security', 'bugs', 'fixed', 'release.', 'All', 'PHP', '5.6', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '5.6.36', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.26', 'Apr', '2018PHP', '7.1.17', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.1.17.', 'This', 'security', 'fix', 'release,', 'containing', 'many', 'bugfixes.', 'All', 'PHP', '7.1', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.1.17', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.26', 'Apr', '2018PHP', '7.0.30', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.0.30.', 'This', 'security', 'release.', 'Several', 'security', 'bugs', 'fixed', 'release.', 'All', 'PHP', '7.0', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.0.30', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.01', 'Feb', '2018PHP', '7.2.2', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.2.', 'This', 'bugfix', 'release,', 'several', 'bug', 'fixes', 'included.All', 'PHP', '7.2', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.2.2', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.12', 'Oct', '2017PHP', '7.2.0', 'Release', 'Candidate', '4', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'RC4.', 'This', 'release', 'fourth', 'Release', 'Candidate', '7.2.0.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Release', 'Candidate', '4', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'announced', '26th', 'October.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'onour', 'wiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.28', 'Sep', '2017PHP', '7.2.0', 'Release', 'Candidate', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'RC3.', 'This', 'release', 'third', 'Release', 'Candidate', '7.2.0.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Release', 'Candidate', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'announced', '12th', 'October.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'onour', 'wiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.31', 'Aug', '2017PHP', '7.2.0', 'Release', 'Candidate', '1', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'Release', 'Candidate', '1.', 'This', 'release', 'first', 'Release', 'Candidate', '7.2.0.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Release', 'Candidate', '1', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'atwindows.php.net/qa/.The', 'second', 'Release', 'Candidate', 'released', '14th', 'September.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'onour', 'wiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.17', 'Aug', '2017PHP', '7.2.0', 'Beta', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'Beta', '3.', 'This', 'release', 'third', 'final', 'beta', '7.2.0.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Beta', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'atwindows.php.net/qa/.The', 'first', 'Release', 'Candidate', 'released', '31th', 'August.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'onour', 'wiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.06', 'Jul', '2017PHP', '7.2.0', 'Alpha', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'Alpha', '3.', 'This', 'release', 'contains', 'fixes', 'improvements', 'relative', 'Alpha', '2.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Alpha', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'onwindows.php.net/qa/.The', 'first', 'beta', 'released', '20th', 'July.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'ourwiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.Older', 'News', 'EntriesConferences', 'calling', 'papersphp[world]', '2018Upcoming', 'conferencesLaravelConf', 'Taiwan', '2018Mid-Atlantic', 'Developer', 'ConferenceInternational', 'PHP', 'Conference', '2018', '-', 'Fall', 'EditionCoderCruise', '2018', '-', 'The', 'Bahamas!User', 'Group', 'EventsSpecial', 'ThanksSocial', 'media@official_phpCopyright', '©', '2001-2018', 'The', 'PHP', 'GroupMy', 'PHP.netContactOther', 'PHP.net', 'sitesMirror', 'sitesPrivacy', 'policy']\n" + ] + } + ], + "source": [ + "# Iterate over all the tokens\n", + "for token in tokens:\n", + " if token in stopwords.words('english'):\n", + " clean_tokens.remove(token)\n", + "print(clean_tokens) " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/arunav/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "['Hello Dr. Adam, how are you?', 'I hope everything is going well.', 'Today is a good day, see you dude.']\n", + "['Hello', 'Dr.', 'Adam', ',', 'how', 'are', 'you', '?', 'I', 'hope', 'everything', 'is', 'going', 'well', '.', 'Today', 'is', 'a', 'good', 'day', ',', 'see', 'you', 'dude', '.']\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.download('punkt')\n", + "#Tokenizing sentences\n", + "\n", + "from nltk.tokenize import sent_tokenize, word_tokenize\n", + "mytext = \"Hello Dr. Adam, how are you? I hope everything is going well. Today is a good day, see you dude.\"\n", + "\n", + "# Sentence tokenizer on mytext\n", + "print(sent_tokenize(mytext))\n", + "\n", + "# Word tokenizer on mytext\n", + "print(word_tokenize(mytext))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /home/arunav/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n", + "Working for word PAIN:\n", + "a symptom of some physical hurt or disorder\n", + "['the patient developed severe pain and distension']\n", + "Working for word NLP:\n", + "the branch of information science that deals with natural language information\n", + "*** all definitions of python ***\n", + "large Old World boas\n", + "a soothsaying spirit or a person who is possessed by such a spirit\n", + "(Greek mythology) dragon killed by Apollo at Delphi\n" + ] + } + ], + "source": [ + "# we can get definition from Wordnet\n", + "# for that we have to import wordnet\n", + "nltk.download('wordnet')\n", + "\n", + "from nltk.corpus import wordnet\n", + "\n", + "# Start using wordnet for a particular word\n", + "\n", + "# 1st word is \"pain\"\n", + "syn = wordnet.synsets(\"pain\")\n", + "print(\"Working for word PAIN:\")\n", + "print(syn[0].definition())\n", + "print(syn[0].examples())\n", + "\n", + "#2nd word is \"NLP\"\n", + "print(\"Working for word NLP:\")\n", + "syn = wordnet.synsets(\"NLP\")\n", + "print(syn[0].definition())\n", + "\n", + "#3rd word is \"Python\"\n", + "syn = wordnet.synsets(\"Python\")\n", + "print(\"*** all definitions of python ***\")\n", + "for items in syn:\n", + " print(items.definition())" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['computer', 'computing_machine', 'computing_device', 'data_processor', 'electronic_computer', 'information_processing_system', 'calculator', 'reckoner', 'figurer', 'estimator', 'computer']\n" + ] + } + ], + "source": [ + "# Get synonyms of words\n", + "from nltk.corpus import wordnet\n", + "synonyms = []\n", + "for syn in wordnet.synsets('Computer'):\n", + " for lemma in syn.lemmas():\n", + " synonyms.append(lemma.name())\n", + "print(synonyms)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stone\n", + "speak\n", + "bedroom\n", + "joke\n", + "lisa\n", + "purpl\n", + "univers\n", + "univers\n", + "----------------------\n", + "stone\n", + "speaking\n", + "bedroom\n", + "joke\n", + "lisa\n", + "domicile\n", + "university\n" + ] + } + ], + "source": [ + "# Stemming and Lemmatization\n", + "\n", + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.stem import PorterStemmer\n", + "stemmer = PorterStemmer()\n", + "lemmatizer = WordNetLemmatizer()\n", + "\n", + "print(stemmer.stem('stones'))\n", + "print(stemmer.stem('speaking'))\n", + "print(stemmer.stem('bedroom'))\n", + "print(stemmer.stem('jokes'))\n", + "print(stemmer.stem('lisa'))\n", + "print(stemmer.stem('purple'))\n", + "print(stemmer.stem('university'))\n", + "print(stemmer.stem('universe'))\n", + "print('----------------------')\n", + "print(lemmatizer.lemmatize('stones'))\n", + "print(lemmatizer.lemmatize('speaking'))\n", + "print(lemmatizer.lemmatize('bedroom'))\n", + "print(lemmatizer.lemmatize('jokes'))\n", + "print(lemmatizer.lemmatize('lisa'))\n", + "print(lemmatizer.lemmatize('domicile'))\n", + "print(lemmatizer.lemmatize('university'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords.zip b/ArunavD/nltk_test/nltk_data/corpora/stopwords.zip new file mode 100644 index 0000000..e781406 Binary files /dev/null and b/ArunavD/nltk_test/nltk_data/corpora/stopwords.zip differ diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/README b/ArunavD/nltk_test/nltk_data/corpora/stopwords/README new file mode 100644 index 0000000..debf14c --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/README @@ -0,0 +1,32 @@ +Stopwords Corpus + +This corpus contains lists of stop words for several languages. These +are high-frequency grammatical words which are usually ignored in text +retrieval applications. + +They were obtained from: +http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ + +The stop words for the Romanian language were obtained from: +http://arlc.ro/resources/ + +The English list has been augmented +https://github.com/nltk/nltk_data/issues/22 + +The German list has been corrected +https://github.com/nltk/nltk_data/pull/49 + +A Kazakh list has been added +https://github.com/nltk/nltk_data/pull/52 + +A Nepali list has been added +https://github.com/nltk/nltk_data/pull/83 + +An Azerbaijani list has been added +https://github.com/nltk/nltk_data/pull/100 + +A Greek list has been added +https://github.com/nltk/nltk_data/pull/103 + +An Indonesian list has been added +https://github.com/nltk/nltk_data/pull/112 diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/arabic b/ArunavD/nltk_test/nltk_data/corpora/stopwords/arabic new file mode 100644 index 0000000..19c9ad8 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/arabic @@ -0,0 +1,248 @@ +إذ +إذا +إذما +إذن +أف +أقل +أكثر +ألا +إلا +التي +الذي +الذين +اللاتي +اللائي +اللتان +اللتيا +اللتين +اللذان +اللذين +اللواتي +إلى +إليك +إليكم +إليكما +إليكن +أم +أما +أما +إما +أن +إن +إنا +أنا +أنت +أنتم +أنتما +أنتن +إنما +إنه +أنى +أنى +آه +آها +أو +أولاء +أولئك +أوه +آي +أي +أيها +إي +أين +أين +أينما +إيه +بخ +بس +بعد +بعض +بك +بكم +بكم +بكما +بكن +بل +بلى +بما +بماذا +بمن +بنا +به +بها +بهم +بهما +بهن +بي +بين +بيد +تلك +تلكم +تلكما +ته +تي +تين +تينك +ثم +ثمة +حاشا +حبذا +حتى +حيث +حيثما +حين +خلا +دون +ذا +ذات +ذاك +ذان +ذانك +ذلك +ذلكم +ذلكما +ذلكن +ذه +ذو +ذوا +ذواتا +ذواتي +ذي +ذين +ذينك +ريث +سوف +سوى +شتان +عدا +عسى +عل +على +عليك +عليه +عما +عن +عند +غير +فإذا +فإن +فلا +فمن +في +فيم +فيما +فيه +فيها +قد +كأن +كأنما +كأي +كأين +كذا +كذلك +كل +كلا +كلاهما +كلتا +كلما +كليكما +كليهما +كم +كم +كما +كي +كيت +كيف +كيفما +لا +لاسيما +لدى +لست +لستم +لستما +لستن +لسن +لسنا +لعل +لك +لكم +لكما +لكن +لكنما +لكي +لكيلا +لم +لما +لن +لنا +له +لها +لهم +لهما +لهن +لو +لولا +لوما +لي +لئن +ليت +ليس +ليسا +ليست +ليستا +ليسوا +ما +ماذا +متى +مذ +مع +مما +ممن +من +منه +منها +منذ +مه +مهما +نحن +نحو +نعم +ها +هاتان +هاته +هاتي +هاتين +هاك +هاهنا +هذا +هذان +هذه +هذي +هذين +هكذا +هل +هلا +هم +هما +هن +هنا +هناك +هنالك +هو +هؤلاء +هي +هيا +هيت +هيهات +والذي +والذين +وإذ +وإذا +وإن +ولا +ولكن +ولو +وما +ومن +وهو +يا diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/azerbaijani b/ArunavD/nltk_test/nltk_data/corpora/stopwords/azerbaijani new file mode 100644 index 0000000..27bf294 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/azerbaijani @@ -0,0 +1,165 @@ +a +ad +altı +altmış +amma +arasında +artıq +ay +az +bax +belə +bəli +bəlkə +beş +bəy +bəzən +bəzi +bilər +bir +biraz +biri +birşey +biz +bizim +bizlər +bu +buna +bundan +bunların +bunu +bunun +buradan +bütün +ci +cı +çox +cu +cü +çünki +da +daha +də +dedi +dək +dən +dəqiqə +deyil +dir +doqquz +doqsan +dörd +düz +ə +edən +edir +əgər +əlbəttə +elə +əlli +ən +əslində +et +etdi +etmə +etmək +faiz +gilə +görə +ha +haqqında +harada +hə +heç +həm +həmin +həmişə +hər +ı +idi +iki +il +ildə +ilə +ilk +in +indi +isə +istifadə +iyirmi +ki +kim +kimə +kimi +lakin +lap +məhz +mən +mənə +mirşey +nə +nəhayət +niyə +o +obirisi +of +olan +olar +olaraq +oldu +olduğu +olmadı +olmaz +olmuşdur +olsun +olur +on +ona +ondan +onlar +onlardan +onların +onsuzda +onu +onun +oradan +otuz +öz +özü +qarşı +qədər +qırx +saat +sadəcə +saniyə +səhv +səkkiz +səksən +sən +sənə +sənin +siz +sizin +sizlər +sonra +təəssüf +ü +üç +üçün +var +və +xan +xanım +xeyr +ya +yalnız +yaxşı +yeddi +yenə +yəni +yetmiş +yox +yoxdur +yoxsa +yüz +zaman \ No newline at end of file diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/danish b/ArunavD/nltk_test/nltk_data/corpora/stopwords/danish new file mode 100644 index 0000000..d3edc67 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/danish @@ -0,0 +1,94 @@ +og +i +jeg +det +at +en +den +til +er +som +på +de +med +han +af +for +ikke +der +var +mig +sig +men +et +har +om +vi +min +havde +ham +hun +nu +over +da +fra +du +ud +sin +dem +os +op +man +hans +hvor +eller +hvad +skal +selv +her +alle +vil +blev +kunne +ind +når +være +dog +noget +ville +jo +deres +efter +ned +skulle +denne +end +dette +mit +også +under +have +dig +anden +hende +mine +alt +meget +sit +sine +vor +mod +disse +hvis +din +nogle +hos +blive +mange +ad +bliver +hendes +været +thi +jer +sådan diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/dutch b/ArunavD/nltk_test/nltk_data/corpora/stopwords/dutch new file mode 100644 index 0000000..cafa032 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/dutch @@ -0,0 +1,101 @@ +de +en +van +ik +te +dat +die +in +een +hij +het +niet +zijn +is +was +op +aan +met +als +voor +had +er +maar +om +hem +dan +zou +of +wat +mijn +men +dit +zo +door +over +ze +zich +bij +ook +tot +je +mij +uit +der +daar +haar +naar +heb +hoe +heeft +hebben +deze +u +want +nog +zal +me +zij +nu +ge +geen +omdat +iets +worden +toch +al +waren +veel +meer +doen +toen +moet +ben +zonder +kan +hun +dus +alles +onder +ja +eens +hier +wie +werd +altijd +doch +wordt +wezen +kunnen +ons +zelf +tegen +na +reeds +wil +kon +niets +uw +iemand +geweest +andere diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/english b/ArunavD/nltk_test/nltk_data/corpora/stopwords/english new file mode 100644 index 0000000..1280aa3 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/english @@ -0,0 +1,179 @@ +i +me +my +myself +we +our +ours +ourselves +you +you're +you've +you'll +you'd +your +yours +yourself +yourselves +he +him +his +himself +she +she's +her +hers +herself +it +it's +its +itself +they +them +their +theirs +themselves +what +which +who +whom +this +that +that'll +these +those +am +is +are +was +were +be +been +being +have +has +had +having +do +does +did +doing +a +an +the +and +but +if +or +because +as +until +while +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under +again +further +then +once +here +there +when +where +why +how +all +any +both +each +few +more +most +other +some +such +no +nor +not +only +own +same +so +than +too +very +s +t +can +will +just +don +don't +should +should've +now +d +ll +m +o +re +ve +y +ain +aren +aren't +couldn +couldn't +didn +didn't +doesn +doesn't +hadn +hadn't +hasn +hasn't +haven +haven't +isn +isn't +ma +mightn +mightn't +mustn +mustn't +needn +needn't +shan +shan't +shouldn +shouldn't +wasn +wasn't +weren +weren't +won +won't +wouldn +wouldn't diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/finnish b/ArunavD/nltk_test/nltk_data/corpora/stopwords/finnish new file mode 100644 index 0000000..47ee200 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/finnish @@ -0,0 +1,235 @@ +olla +olen +olet +on +olemme +olette +ovat +ole +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet +en +et +ei +emme +ette +eivät +minä +minun +minut +minua +minussa +minusta +minuun +minulla +minulta +minulle +sinä +sinun +sinut +sinua +sinussa +sinusta +sinuun +sinulla +sinulta +sinulle +hän +hänen +hänet +häntä +hänessä +hänestä +häneen +hänellä +häneltä +hänelle +me +meidän +meidät +meitä +meissä +meistä +meihin +meillä +meiltä +meille +te +teidän +teidät +teitä +teissä +teistä +teihin +teillä +teiltä +teille +he +heidän +heidät +heitä +heissä +heistä +heihin +heillä +heiltä +heille +tämä +tämän +tätä +tässä +tästä +tähän +tallä +tältä +tälle +tänä +täksi +tuo +tuon +tuotä +tuossa +tuosta +tuohon +tuolla +tuolta +tuolle +tuona +tuoksi +se +sen +sitä +siinä +siitä +siihen +sillä +siltä +sille +sinä +siksi +nämä +näiden +näitä +näissä +näistä +näihin +näillä +näiltä +näille +näinä +näiksi +nuo +noiden +noita +noissa +noista +noihin +noilla +noilta +noille +noina +noiksi +ne +niiden +niitä +niissä +niistä +niihin +niillä +niiltä +niille +niinä +niiksi +kuka +kenen +kenet +ketä +kenessä +kenestä +keneen +kenellä +keneltä +kenelle +kenenä +keneksi +ketkä +keiden +ketkä +keitä +keissä +keistä +keihin +keillä +keiltä +keille +keinä +keiksi +mikä +minkä +minkä +mitä +missä +mistä +mihin +millä +miltä +mille +minä +miksi +mitkä +joka +jonka +jota +jossa +josta +johon +jolla +jolta +jolle +jona +joksi +jotka +joiden +joita +joissa +joista +joihin +joilla +joilta +joille +joina +joiksi +että +ja +jos +koska +kuin +mutta +niin +sekä +sillä +tai +vaan +vai +vaikka +kanssa +mukaan +noin +poikki +yli +kun +niin +nyt +itse diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/french b/ArunavD/nltk_test/nltk_data/corpora/stopwords/french new file mode 100644 index 0000000..e7cbf4c --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/french @@ -0,0 +1,155 @@ +au +aux +avec +ce +ces +dans +de +des +du +elle +en +et +eux +il +je +la +le +leur +lui +ma +mais +me +même +mes +moi +mon +ne +nos +notre +nous +on +ou +par +pas +pour +qu +que +qui +sa +se +ses +son +sur +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +c +d +j +l +à +m +n +s +t +y +été +étée +étées +étés +étant +étante +étants +étantes +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent +ayant +ayante +ayantes +ayants +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/german b/ArunavD/nltk_test/nltk_data/corpora/stopwords/german new file mode 100644 index 0000000..49e8a18 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/german @@ -0,0 +1,231 @@ +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +daß +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unsere +unserem +unseren +unser +unseres +unter +viel +vom +von +vor +während +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/greek b/ArunavD/nltk_test/nltk_data/corpora/stopwords/greek new file mode 100644 index 0000000..9d08b14 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/greek @@ -0,0 +1,265 @@ +αλλα +αν +αντι +απο +αυτα +αυτεσ +αυτη +αυτο +αυτοι +αυτοσ +αυτουσ +αυτων +αἱ +αἳ +αἵ +αὐτόσ +αὐτὸς +αὖ +γάρ +γα +γα^ +γε +για +γοῦν +γὰρ +δ' +δέ +δή +δαί +δαίσ +δαὶ +δαὶς +δε +δεν +δι' +διά +διὰ +δὲ +δὴ +δ’ +εαν +ειμαι +ειμαστε +ειναι +εισαι +ειστε +εκεινα +εκεινεσ +εκεινη +εκεινο +εκεινοι +εκεινοσ +εκεινουσ +εκεινων +ενω +επ +επι +εἰ +εἰμί +εἰμὶ +εἰς +εἰσ +εἴ +εἴμι +εἴτε +η +θα +ισωσ +κ +καί +καίτοι +καθ +και +κατ +κατά +κατα +κατὰ +καὶ +κι +κἀν +κἂν +μέν +μή +μήτε +μα +με +μεθ +μετ +μετά +μετα +μετὰ +μη +μην +μἐν +μὲν +μὴ +μὴν +να +ο +οι +ομωσ +οπωσ +οσο +οτι +οἱ +οἳ +οἷς +οὐ +οὐδ +οὐδέ +οὐδείσ +οὐδεὶς +οὐδὲ +οὐδὲν +οὐκ +οὐχ +οὐχὶ +οὓς +οὔτε +οὕτω +οὕτως +οὕτωσ +οὖν +οὗ +οὗτος +οὗτοσ +παρ +παρά +παρα +παρὰ +περί +περὶ +ποια +ποιεσ +ποιο +ποιοι +ποιοσ +ποιουσ +ποιων +ποτε +που +ποῦ +προ +προσ +πρόσ +πρὸ +πρὸς +πως +πωσ +σε +στη +στην +στο +στον +σόσ +σύ +σύν +σὸς +σὺ +σὺν +τά +τήν +τί +τίς +τίσ +τα +ταῖς +τε +την +τησ +τι +τινα +τις +τισ +το +τοί +τοι +τοιοῦτος +τοιοῦτοσ +τον +τοτε +του +τούσ +τοὺς +τοῖς +τοῦ +των +τό +τόν +τότε +τὰ +τὰς +τὴν +τὸ +τὸν +τῆς +τῆσ +τῇ +τῶν +τῷ +ωσ +ἀλλ' +ἀλλά +ἀλλὰ +ἀλλ’ +ἀπ +ἀπό +ἀπὸ +ἀφ +ἂν +ἃ +ἄλλος +ἄλλοσ +ἄν +ἄρα +ἅμα +ἐάν +ἐγώ +ἐγὼ +ἐκ +ἐμόσ +ἐμὸς +ἐν +ἐξ +ἐπί +ἐπεὶ +ἐπὶ +ἐστι +ἐφ +ἐὰν +ἑαυτοῦ +ἔτι +ἡ +ἢ +ἣ +ἤ +ἥ +ἧς +ἵνα +ὁ +ὃ +ὃν +ὃς +ὅ +ὅδε +ὅθεν +ὅπερ +ὅς +ὅσ +ὅστις +ὅστισ +ὅτε +ὅτι +ὑμόσ +ὑπ +ὑπέρ +ὑπό +ὑπὲρ +ὑπὸ +ὡς +ὡσ +ὥς +ὥστε +ὦ +ᾧ diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/hungarian b/ArunavD/nltk_test/nltk_data/corpora/stopwords/hungarian new file mode 100644 index 0000000..94e9f9a --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/hungarian @@ -0,0 +1,199 @@ +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elõ +elõször +elõtt +elsõ +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +õ +õk +õket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/indonesian b/ArunavD/nltk_test/nltk_data/corpora/stopwords/indonesian new file mode 100644 index 0000000..bf88a45 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/indonesian @@ -0,0 +1,758 @@ +ada +adalah +adanya +adapun +agak +agaknya +agar +akan +akankah +akhir +akhiri +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +antara +antaranya +apa +apaan +apabila +apakah +apalagi +apatah +artinya +asal +asalkan +atas +atau +ataukah +ataupun +awal +awalnya +bagai +bagaikan +bagaimana +bagaimanakah +bagaimanapun +bagi +bagian +bahkan +bahwa +bahwasanya +baik +bakal +bakalan +balik +banyak +bapak +baru +bawah +beberapa +begini +beginian +beginikah +beginilah +begitu +begitukah +begitulah +begitupun +bekerja +belakang +belakangan +belum +belumlah +benar +benarkah +benarlah +berada +berakhir +berakhirlah +berakhirnya +berapa +berapakah +berapalah +berapapun +berarti +berawal +berbagai +berdatangan +beri +berikan +berikut +berikutnya +berjumlah +berkali-kali +berkata +berkehendak +berkeinginan +berkenaan +berlainan +berlalu +berlangsung +berlebihan +bermacam +bermacam-macam +bermaksud +bermula +bersama +bersama-sama +bersiap +bersiap-siap +bertanya +bertanya-tanya +berturut +berturut-turut +bertutur +berujar +berupa +besar +betul +betulkah +biasa +biasanya +bila +bilakah +bisa +bisakah +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +bulan +bung +cara +caranya +cukup +cukupkah +cukuplah +cuma +dahulu +dalam +dan +dapat +dari +daripada +datang +dekat +demi +demikian +demikianlah +dengan +depan +di +dia +diakhiri +diakhirinya +dialah +diantara +diantaranya +diberi +diberikan +diberikannya +dibuat +dibuatnya +didapat +didatangkan +digunakan +diibaratkan +diibaratkannya +diingat +diingatkan +diinginkan +dijawab +dijelaskan +dijelaskannya +dikarenakan +dikatakan +dikatakannya +dikerjakan +diketahui +diketahuinya +dikira +dilakukan +dilalui +dilihat +dimaksud +dimaksudkan +dimaksudkannya +dimaksudnya +diminta +dimintai +dimisalkan +dimulai +dimulailah +dimulainya +dimungkinkan +dini +dipastikan +diperbuat +diperbuatnya +dipergunakan +diperkirakan +diperlihatkan +diperlukan +diperlukannya +dipersoalkan +dipertanyakan +dipunyai +diri +dirinya +disampaikan +disebut +disebutkan +disebutkannya +disini +disinilah +ditambahkan +ditandaskan +ditanya +ditanyai +ditanyakan +ditegaskan +ditujukan +ditunjuk +ditunjuki +ditunjukkan +ditunjukkannya +ditunjuknya +dituturkan +dituturkannya +diucapkan +diucapkannya +diungkapkan +dong +dua +dulu +empat +enggak +enggaknya +entah +entahlah +guna +gunakan +hal +hampir +hanya +hanyalah +hari +harus +haruslah +harusnya +hendak +hendaklah +hendaknya +hingga +ia +ialah +ibarat +ibaratkan +ibaratnya +ibu +ikut +ingat +ingat-ingat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jadi +jadilah +jadinya +jangan +jangankan +janganlah +jauh +jawab +jawaban +jawabnya +jelas +jelaskan +jelaslah +jelasnya +jika +jikalau +juga +jumlah +jumlahnya +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +karena +karenanya +kasus +kata +katakan +katakanlah +katanya +ke +keadaan +kebetulan +kecil +kedua +keduanya +keinginan +kelamaan +kelihatan +kelihatannya +kelima +keluar +kembali +kemudian +kemungkinan +kemungkinannya +kenapa +kepada +kepadanya +kesampaian +keseluruhan +keseluruhannya +keterlaluan +ketika +khususnya +kini +kinilah +kira +kira-kira +kiranya +kita +kitalah +kok +kurang +lagi +lagian +lah +lain +lainnya +lalu +lama +lamanya +lanjut +lanjutnya +lebih +lewat +lima +luar +macam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masa +masalah +masalahnya +masih +masihkah +masing +masing-masing +mau +maupun +melainkan +melakukan +melalui +melihat +melihatnya +memang +memastikan +memberi +memberikan +membuat +memerlukan +memihak +meminta +memintakan +memisalkan +memperbuat +mempergunakan +memperkirakan +memperlihatkan +mempersiapkan +mempersoalkan +mempertanyakan +mempunyai +memulai +memungkinkan +menaiki +menambahkan +menandaskan +menanti +menanti-nanti +menantikan +menanya +menanyai +menanyakan +mendapat +mendapatkan +mendatang +mendatangi +mendatangkan +menegaskan +mengakhiri +mengapa +mengatakan +mengatakannya +mengenai +mengerjakan +mengetahui +menggunakan +menghendaki +mengibaratkan +mengibaratkannya +mengingat +mengingatkan +menginginkan +mengira +mengucapkan +mengucapkannya +mengungkapkan +menjadi +menjawab +menjelaskan +menuju +menunjuk +menunjuki +menunjukkan +menunjuknya +menurut +menuturkan +menyampaikan +menyangkut +menyatakan +menyebutkan +menyeluruh +menyiapkan +merasa +mereka +merekalah +merupakan +meski +meskipun +meyakini +meyakinkan +minta +mirip +misal +misalkan +misalnya +mula +mulai +mulailah +mulanya +mungkin +mungkinkah +nah +naik +namun +nanti +nantinya +nyaris +nyatanya +oleh +olehnya +pada +padahal +padanya +pak +paling +panjang +pantas +para +pasti +pastilah +penting +pentingnya +per +percuma +perlu +perlukah +perlunya +pernah +persoalan +pertama +pertama-tama +pertanyaan +pertanyakan +pihak +pihaknya +pukul +pula +pun +punya +rasa +rasanya +rata +rupanya +saat +saatnya +saja +sajalah +saling +sama +sama-sama +sambil +sampai +sampai-sampai +sampaikan +sana +sangat +sangatlah +satu +saya +sayalah +se +sebab +sebabnya +sebagai +sebagaimana +sebagainya +sebagian +sebaik +sebaik-baiknya +sebaiknya +sebaliknya +sebanyak +sebegini +sebegitu +sebelum +sebelumnya +sebenarnya +seberapa +sebesar +sebetulnya +sebisanya +sebuah +sebut +sebutlah +sebutnya +secara +secukupnya +sedang +sedangkan +sedemikian +sedikit +sedikitnya +seenaknya +segala +segalanya +segera +seharusnya +sehingga +seingat +sejak +sejauh +sejenak +sejumlah +sekadar +sekadarnya +sekali +sekali-kali +sekalian +sekaligus +sekalipun +sekarang +sekarang +sekecil +seketika +sekiranya +sekitar +sekitarnya +sekurang-kurangnya +sekurangnya +sela +selain +selaku +selalu +selama +selama-lamanya +selamanya +selanjutnya +seluruh +seluruhnya +semacam +semakin +semampu +semampunya +semasa +semasih +semata +semata-mata +semaunya +sementara +semisal +semisalnya +sempat +semua +semuanya +semula +sendiri +sendirian +sendirinya +seolah +seolah-olah +seorang +sepanjang +sepantasnya +sepantasnyalah +seperlunya +seperti +sepertinya +sepihak +sering +seringnya +serta +serupa +sesaat +sesama +sesampai +sesegera +sesekali +seseorang +sesuatu +sesuatunya +sesudah +sesudahnya +setelah +setempat +setengah +seterusnya +setiap +setiba +setibanya +setidak-tidaknya +setidaknya +setinggi +seusai +sewaktu +siap +siapa +siapakah +siapapun +sini +sinilah +soal +soalnya +suatu +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tahu +tahun +tak +tambah +tambahnya +tampak +tampaknya +tandas +tandasnya +tanpa +tanya +tanyakan +tanyanya +tapi +tegas +tegasnya +telah +tempat +tengah +tentang +tentu +tentulah +tentunya +tepat +terakhir +terasa +terbanyak +terdahulu +terdapat +terdiri +terhadap +terhadapnya +teringat +teringat-ingat +terjadi +terjadilah +terjadinya +terkira +terlalu +terlebih +terlihat +termasuk +ternyata +tersampaikan +tersebut +tersebutlah +tertentu +tertuju +terus +terutama +tetap +tetapi +tiap +tiba +tiba-tiba +tidak +tidakkah +tidaklah +tiga +tinggi +toh +tunjuk +turut +tutur +tuturnya +ucap +ucapnya +ujar +ujarnya +umum +umumnya +ungkap +ungkapnya +untuk +usah +usai +waduh +wah +wahai +waktu +waktunya +walau +walaupun +wong +yaitu +yakin +yakni +yang \ No newline at end of file diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/italian b/ArunavD/nltk_test/nltk_data/corpora/stopwords/italian new file mode 100644 index 0000000..6ee02b5 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/italian @@ -0,0 +1,279 @@ +ad +al +allo +ai +agli +all +agl +alla +alle +con +col +coi +da +dal +dallo +dai +dagli +dall +dagl +dalla +dalle +di +del +dello +dei +degli +dell +degl +della +delle +in +nel +nello +nei +negli +nell +negl +nella +nelle +su +sul +sullo +sui +sugli +sull +sugl +sulla +sulle +per +tra +contro +io +tu +lui +lei +noi +voi +loro +mio +mia +miei +mie +tuo +tua +tuoi +tue +suo +sua +suoi +sue +nostro +nostra +nostri +nostre +vostro +vostra +vostri +vostre +mi +ti +ci +vi +lo +la +li +le +gli +ne +il +un +uno +una +ma +ed +se +perché +anche +come +dov +dove +che +chi +cui +non +più +quale +quanto +quanti +quanta +quante +quello +quelli +quella +quelle +questo +questi +questa +queste +si +tutto +tutti +a +c +e +i +l +o +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/kazakh b/ArunavD/nltk_test/nltk_data/corpora/stopwords/kazakh new file mode 100644 index 0000000..ebb9fc1 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/kazakh @@ -0,0 +1,380 @@ +ах +ох +эх +ай +эй +ой +тағы +тағыда +әрине +жоқ +сондай +осындай +осылай +солай +мұндай +бұндай +мен +сен +ол +біз +біздер +олар +сіз +сіздер +маған +оған +саған +біздің +сіздің +оның +бізге +сізге +оларға +біздерге +сіздерге +оларға +менімен +сенімен +онымен +бізбен +сізбен +олармен +біздермен +сіздермен +менің +сенің +біздің +сіздің +оның +біздердің +сіздердің +олардың +маған +саған +оған +менен +сенен +одан +бізден +сізден +олардан +біздерден +сіздерден +олардан +айтпақшы +сонымен +сондықтан +бұл +осы +сол +анау +мынау +сонау +осынау +ана +мына +сона +әні +міне +өй +үйт +бүйт +біреу +кейбіреу +кейбір +қайсыбір +әрбір +бірнеше +бірдеме +бірнеше +әркім +әрне +әрқайсы +әрқалай +әлдекім +әлдене +әлдеқайдан +әлденеше +әлдеқалай +әлдеқашан +алдақашан +еш +ешкім +ешбір +ештеме +дәнеңе +ешқашан +ешқандай +ешқайсы +емес +бәрі +барлық +барша +бар +күллі +бүкіл +түгел +өз +өзім +өзің +өзінің +өзіме +өзіне +өзімнің +өзі +өзге +менде +сенде +онда +менен +сенен онан +одан +ау +па +ей +әй +е +уа +уау +уай +я +пай +ә +о +оһо +ой +ие +аһа +ау +беу +мәссаған +бәрекелді +әттегенай +жаракімалла +масқарай +астапыралла +япырмай +ойпырмай +кәне +кәнеки +ал +әйда +кәні +міне +әні +сорап +қош-қош +пфша +пішә +құрау-құрау +шәйт +шек +моһ +тәк +құрау +құр +кә +кәһ +күшім +күшім +мышы +пырс +әукім +алақай +паһ-паһ +бәрекелді +ура +әттең +әттеген-ай +қап +түге +пішту +шіркін +алатау +пай-пай +үшін +сайын +сияқты +туралы +арқылы +бойы +бойымен +шамалы +шақты +қаралы +ғұрлы +ғұрлым +шейін +дейін +қарай +таман +салым +тарта +жуық +таяу +гөрі +бері +кейін +соң +бұрын +бетер +қатар +бірге +қоса +арс + +гүрс + +дүрс + +қорс + +тарс + +тырс + +ырс + +барқ + +борт + +күрт + +кірт + +морт + +сарт + +шырт + +дүңк + +күңк + +қыңқ + +мыңқ + +маңқ + +саңқ + +шаңқ + +шіңк + +сыңқ + +таңқ + +тыңқ + +ыңқ + +болп + +былп + +жалп + +желп + +қолп + +ірк + +ырқ + +сарт-сұрт + +тарс-тұрс + +арс-ұрс + +жалт-жалт + +жалт-жұлт + +қалт-қалт + +қалт-құлт + +қаңқ-қаңқ + +қаңқ-құңқ + +шаңқ-шаңқ + +шаңқ-шұңқ + +арбаң-арбаң + +бүгжең-бүгжең + +арсалаң-арсалаң + +ербелең-ербелең + +батыр-бұтыр + +далаң-далаң + +тарбаң-тарбаң + +қызараң-қызараң + +қаңғыр-күңгір + +қайқаң-құйқаң + +митың-митың + +салаң-сұлаң + +ыржың-тыржың +бірақ +алайда +дегенмен +әйтпесе +әйткенмен +себебі +өйткені +сондықтан +үшін +сайын +сияқты +туралы +арқылы +бойы +бойымен +шамалы +шақты +қаралы +ғұрлы +ғұрлым +гөрі +бері +кейін +соң +бұрын +бетер +қатар +бірге +қоса +шейін +дейін +қарай +таман +салым +тарта +жуық +таяу +арнайы +осындай +ғана +қана +тек +әншейін diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/nepali b/ArunavD/nltk_test/nltk_data/corpora/stopwords/nepali new file mode 100644 index 0000000..b2e4d34 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/nepali @@ -0,0 +1,255 @@ +छ +र +पनि +छन् +लागि +भएको +गरेको +भने +गर्न +गर्ने +हो +तथा +यो +रहेको +उनले +थियो +हुने +गरेका +थिए +गर्दै +तर +नै +को +मा +हुन् +भन्ने +हुन +गरी +त +हुन्छ +अब +के +रहेका +गरेर +छैन +दिए +भए +यस +ले +गर्नु +औं +सो +त्यो +कि +जुन +यी +का +गरि +ती +न +छु +छौं +लाई +नि +उप +अक्सर +आदि +कसरी +क्रमशः +चाले +अगाडी +अझै +अनुसार +अन्तर्गत +अन्य +अन्यत्र +अन्यथा +अरु +अरुलाई +अर्को +अर्थात +अर्थात् +अलग +आए +आजको +ओठ +आत्म +आफू +आफूलाई +आफ्नै +आफ्नो +आयो +उदाहरण +उनको +उहालाई +एउटै +एक +एकदम +कतै +कम से कम +कसै +कसैले +कहाँबाट +कहिलेकाहीं +का +किन +किनभने +कुनै +कुरा +कृपया +केही +कोही +गए +गरौं +गर्छ +गर्छु +गर्नुपर्छ +गयौ +गैर +चार +चाहनुहुन्छ +चाहन्छु +चाहिए +छू +जताततै +जब +जबकि +जसको +जसबाट +जसमा +जसलाई +जसले +जस्तै +जस्तो +जस्तोसुकै +जहाँ +जान +जाहिर +जे +जो +ठीक +तत्काल +तदनुसार +तपाईको +तपाई +पर्याप्त +पहिले +पहिलो +पहिल्यै +पाँच +पाँचौं +तल +तापनी +तिनी +तिनीहरू +तिनीहरुको +तिनिहरुलाई +तिमी +तिर +तीन +तुरुन्तै +तेस्रो +तेस्कारण +पूर्व +प्रति +प्रतेक +प्लस +फेरी +बने +त्सपछि +त्सैले +त्यहाँ +थिएन +दिनुभएको +दिनुहुन्छ +दुई +देखि +बरु +बारे +बाहिर +देखिन्छ +देखियो +देखे +देखेको +देखेर +दोस्रो +धेरै +नजिकै +नत्र +नयाँ +निम्ति +बाहेक +बीच +बीचमा +भन +निम्न +निम्नानुसार +निर्दिष्ट +नौ +पक्का +पक्कै +पछि +पछिल्लो +पटक +पर्छ +पर्थ्यो +भन्छन् +भन् +भन्छु +भन्दा +भन्नुभयो +भर +भित्र +भित्री +म +मलाई +मात्र +माथि +मुख्य +मेरो +यति +यथोचित +यदि +यद्यपि +यसको +यसपछि +यसबाहेक +यसरी +यसो +यस्तो +यहाँ +यहाँसम्म +या +रही +राखे +राख्छ +राम्रो +रूप +लगभग +वरीपरी +वास्तवमा +बिरुद्ध +बिशेष +सायद +शायद +संग +संगै +सक्छ +सट्टा +सधै +सबै +सबैलाई +समय +सम्भव +सम्म +सही +साँच्चै +सात +साथ +साथै +सारा +सोही +स्पष्ट +हरे +हरेक \ No newline at end of file diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/norwegian b/ArunavD/nltk_test/nltk_data/corpora/stopwords/norwegian new file mode 100644 index 0000000..9ac1abb --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/norwegian @@ -0,0 +1,176 @@ +og +i +jeg +det +at +en +et +den +til +er +som +på +de +med +han +av +ikke +ikkje +der +så +var +meg +seg +men +ett +har +om +vi +min +mitt +ha +hadde +hun +nå +over +da +ved +fra +du +ut +sin +dem +oss +opp +man +kan +hans +hvor +eller +hva +skal +selv +sjøl +her +alle +vil +bli +ble +blei +blitt +kunne +inn +når +være +kom +noen +noe +ville +dere +som +deres +kun +ja +etter +ned +skulle +denne +for +deg +si +sine +sitt +mot +å +meget +hvorfor +dette +disse +uten +hvordan +ingen +din +ditt +blir +samme +hvilken +hvilke +sånn +inni +mellom +vår +hver +hvem +vors +hvis +både +bare +enn +fordi +før +mange +også +slik +vært +være +båe +begge +siden +dykk +dykkar +dei +deira +deires +deim +di +då +eg +ein +eit +eitt +elles +honom +hjå +ho +hoe +henne +hennar +hennes +hoss +hossen +ikkje +ingi +inkje +korleis +korso +kva +kvar +kvarhelst +kven +kvi +kvifor +me +medan +mi +mine +mykje +no +nokon +noka +nokor +noko +nokre +si +sia +sidan +so +somt +somme +um +upp +vere +vore +verte +vort +varte +vart diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/portuguese b/ArunavD/nltk_test/nltk_data/corpora/stopwords/portuguese new file mode 100644 index 0000000..6b24778 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/portuguese @@ -0,0 +1,203 @@ +de +a +o +que +e +do +da +em +um +para +com +não +uma +os +no +se +na +por +mais +as +dos +como +mas +ao +ele +das +à +seu +sua +ou +quando +muito +nos +já +eu +também +só +pelo +pela +até +isso +ela +entre +depois +sem +mesmo +aos +seus +quem +nas +me +esse +eles +você +essa +num +nem +suas +meu +às +minha +numa +pelos +elas +qual +nós +lhe +deles +essas +esses +pelas +este +dele +tu +te +vocês +vos +lhes +meus +minhas +teu +tua +teus +tuas +nosso +nossa +nossos +nossas +dela +delas +esta +estes +estas +aquele +aquela +aqueles +aquelas +isto +aquilo +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/romanian b/ArunavD/nltk_test/nltk_data/corpora/stopwords/romanian new file mode 100644 index 0000000..45651c9 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/romanian @@ -0,0 +1,356 @@ +a +abia +acea +aceasta +această +aceea +aceeasi +acei +aceia +acel +acela +acelasi +acele +acelea +acest +acesta +aceste +acestea +acestei +acestia +acestui +aceşti +aceştia +adica +ai +aia +aibă +aici +al +ala +ale +alea +alt +alta +altceva +altcineva +alte +altfel +alti +altii +altul +am +anume +apoi +ar +are +as +asa +asta +astea +astfel +asupra +atare +atat +atata +atatea +atatia +ati +atit +atita +atitea +atitia +atunci +au +avea +avem +aveţi +avut +aş +aţi +ba +ca +cam +cand +care +careia +carora +caruia +cat +catre +ce +cea +ceea +cei +ceilalti +cel +cele +celor +ceva +chiar +ci +cind +cine +cineva +cit +cita +cite +citeva +citi +citiva +cu +cui +cum +cumva +cât +câte +câtva +câţi +cînd +cît +cîte +cîtva +cîţi +că +căci +cărei +căror +cărui +către +da +daca +dacă +dar +dat +dată +dau +de +deasupra +deci +decit +deja +desi +despre +deşi +din +dintr +dintr- +dintre +doar +doi +doilea +două +drept +dupa +după +dă +e +ea +ei +el +ele +era +eram +este +eu +eşti +face +fara +fata +fel +fi +fie +fiecare +fii +fim +fiu +fiţi +foarte +fost +fără +i +ia +iar +ii +il +imi +in +inainte +inapoi +inca +incit +insa +intr +intre +isi +iti +la +le +li +lor +lui +lângă +lîngă +m +ma +mai +mea +mei +mele +mereu +meu +mi +mie +mine +mod +mult +multa +multe +multi +multă +mulţi +mâine +mîine +mă +ne +ni +nici +nimeni +nimic +niste +nişte +noastre +noastră +noi +nostri +nostru +nou +noua +nouă +noştri +nu +numai +o +or +ori +oricare +orice +oricine +oricum +oricând +oricât +oricînd +oricît +oriunde +pai +parca +patra +patru +pe +pentru +peste +pic +pina +poate +pot +prea +prima +primul +prin +printr- +putini +puţin +puţina +puţină +până +pînă +sa +sa-mi +sa-ti +sai +sale +sau +se +si +sint +sintem +spate +spre +sub +sunt +suntem +sunteţi +sus +să +săi +său +t +ta +tale +te +ti +tine +toata +toate +toată +tocmai +tot +toti +totul +totusi +totuşi +toţi +trei +treia +treilea +tu +tuturor +tăi +tău +u +ul +ului +un +una +unde +undeva +unei +uneia +unele +uneori +unii +unor +unora +unu +unui +unuia +unul +v +va +vi +voastre +voastră +voi +vom +vor +vostru +vouă +voştri +vreo +vreun +vă +zi +zice +îi +îl +îmi +în +îţi +ăla +ălea +ăsta +ăstea +ăştia +şi +ţi +ţie \ No newline at end of file diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/russian b/ArunavD/nltk_test/nltk_data/corpora/stopwords/russian new file mode 100644 index 0000000..ecb83d4 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/russian @@ -0,0 +1,151 @@ +и +в +во +не +что +он +на +я +с +со +как +а +то +все +она +так +его +но +да +ты +к +у +же +вы +за +бы +по +только +ее +мне +было +вот +от +меня +еще +нет +о +из +ему +теперь +когда +даже +ну +вдруг +ли +если +уже +или +ни +быть +был +него +до +вас +нибудь +опять +уж +вам +ведь +там +потом +себя +ничего +ей +может +они +тут +где +есть +надо +ней +для +мы +тебя +их +чем +была +сам +чтоб +без +будто +чего +раз +тоже +себе +под +будет +ж +тогда +кто +этот +того +потому +этого +какой +совсем +ним +здесь +этом +один +почти +мой +тем +чтобы +нее +сейчас +были +куда +зачем +всех +никогда +можно +при +наконец +два +об +другой +хоть +после +над +больше +тот +через +эти +нас +про +всего +них +какая +много +разве +три +эту +моя +впрочем +хорошо +свою +этой +перед +иногда +лучше +чуть +том +нельзя +такой +им +более +всегда +конечно +всю +между diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/spanish b/ArunavD/nltk_test/nltk_data/corpora/stopwords/spanish new file mode 100644 index 0000000..59bc786 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/spanish @@ -0,0 +1,313 @@ +de +la +que +el +en +y +a +los +del +se +las +por +un +para +con +no +una +su +al +lo +como +más +pero +sus +le +ya +o +este +sí +porque +esta +entre +cuando +muy +sin +sobre +también +me +hasta +hay +donde +quien +desde +todo +nos +durante +todos +uno +les +ni +contra +otros +ese +eso +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +él +tanto +esa +estos +mucho +quienes +nada +muchos +cual +poco +ella +estar +estas +algunas +algo +nosotros +mi +mis +tú +te +ti +tu +tus +ellas +nosotras +vosostros +vosostras +os +mío +mía +míos +mías +tuyo +tuya +tuyos +tuyas +suyo +suya +suyos +suyas +nuestro +nuestra +nuestros +nuestras +vuestro +vuestra +vuestros +vuestras +esos +esas +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +sintiendo +sentido +sentida +sentidos +sentidas +siente +sentid +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/swedish b/ArunavD/nltk_test/nltk_data/corpora/stopwords/swedish new file mode 100644 index 0000000..742bb62 --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/swedish @@ -0,0 +1,114 @@ +och +det +att +i +en +jag +hon +som +han +på +den +med +var +sig +för +så +till +är +men +ett +om +hade +de +av +icke +mig +du +henne +då +sin +nu +har +inte +hans +honom +skulle +hennes +där +min +man +ej +vid +kunde +något +från +ut +när +efter +upp +vi +dem +vara +vad +över +än +dig +kan +sina +här +ha +mot +alla +under +någon +eller +allt +mycket +sedan +ju +denna +själv +detta +åt +utan +varit +hur +ingen +mitt +ni +bli +blev +oss +din +dessa +några +deras +blir +mina +samma +vilken +er +sådan +vår +blivit +dess +inom +mellan +sådant +varför +varje +vilka +ditt +vem +vilket +sitta +sådana +vart +dina +vars +vårt +våra +ert +era +vilkas diff --git a/ArunavD/nltk_test/nltk_data/corpora/stopwords/turkish b/ArunavD/nltk_test/nltk_data/corpora/stopwords/turkish new file mode 100644 index 0000000..5a48ccc --- /dev/null +++ b/ArunavD/nltk_test/nltk_data/corpora/stopwords/turkish @@ -0,0 +1,53 @@ +acaba +ama +aslında +az +bazı +belki +biri +birkaç +birşey +biz +bu +çok +çünkü +da +daha +de +defa +diye +eğer +en +gibi +hem +hep +hepsi +her +hiç +için +ile +ise +kez +ki +kim +mı +mu +mü +nasıl +ne +neden +nerde +nerede +nereye +niçin +niye +o +sanki +şey +siz +şu +tüm +ve +veya +ya +yani diff --git a/ArunavD/nltk_test/nltk_run.py.ipynb b/ArunavD/nltk_test/nltk_run.py.ipynb new file mode 100644 index 0000000..6042edd --- /dev/null +++ b/ArunavD/nltk_test/nltk_run.py.ipynb @@ -0,0 +1,825 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Create dictionary from each words\n", + "\n", + "\n", + "from bs4 import BeautifulSoup\n", + "import urllib.request\n", + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Requesting the webpage\n", + "\n", + "\n", + "response = urllib.request.urlopen('http://php.net/')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Reading contents of the page\n", + "\n", + "\n", + "html = response.read()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PHP: Hypertext PreprocessorDownloadsDocumentationGet InvolvedHelpGetting StartedIntroductionA simple tutorialLanguage ReferenceBasic syntaxTypesVariablesConstantsExpressionsOperatorsControl StructuresFunctionsClasses and ObjectsNamespacesErrorsExceptionsGeneratorsReferences ExplainedPredefined VariablesPredefined ExceptionsPredefined Interfaces and ClassesContext options and parametersSupported Protocols and WrappersSecurityIntroductionGeneral considerationsInstalled as CGI binaryInstalled as an Apache moduleSession SecurityFilesystem SecurityDatabase SecurityError ReportingUsing Register GlobalsUser Submitted DataMagic QuotesHiding PHPKeeping CurrentFeaturesHTTP authentication with PHPCookiesSessionsDealing with XFormsHandling file uploadsUsing remote filesConnection handlingPersistent Database ConnectionsSafe ModeCommand line usageGarbage CollectionDTrace Dynamic TracingFunction ReferenceAffecting PHP's BehaviourAudio Formats ManipulationAuthentication ServicesCommand Line Specific ExtensionsCompression and Archive ExtensionsCredit Card ProcessingCryptography ExtensionsDatabase ExtensionsDate and Time Related ExtensionsFile System Related ExtensionsHuman Language and Character Encoding SupportImage Processing and GenerationMail Related ExtensionsMathematical ExtensionsNon-Text MIME OutputProcess Control ExtensionsOther Basic ExtensionsOther ServicesSearch Engine ExtensionsServer Specific ExtensionsSession ExtensionsText ProcessingVariable and Type Related ExtensionsWeb ServicesWindows Only ExtensionsXML ManipulationGUI ExtensionsKeyboard Shortcuts?This helpjNext menu itemkPrevious menu itemg pPrevious man pageg nNext man pageGScroll to bottomg gScroll to topg hGoto homepageg sGoto search(current page)/Focus search boxPHP is a popular general-purpose scripting language that is especially suited to web development.Fast, flexible and pragmatic, PHP powers everything from your blog to the most popular websites in the world.Download5.6.36·Release Notes·Upgrading7.0.30·Release Notes·Upgrading7.1.18·Release Notes·Upgrading7.2.7·Release Notes·Upgrading21 Jun 2018PHP 7.2.7 ReleasedPHP 7.2.7 Release AnnouncementThe PHP development team announces the immediate availability of PHP 7.2.7.\n", + " This is a primarily a bugfix release which includes a segfault fix for opcache.PHP 7.2 users are encouraged to upgrade to this version.For source downloads of PHP 7.2.7 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.21 Jun 2018PHP 7.3.0 alpha 2 ReleasedThe PHP team is glad to announce the release of the second PHP 7.3.0 version, PHP 7.3.0 Alpha 2. \n", + " The rough outline of the PHP 7.3 release cycle is specified in thePHP Wiki.For source downloads of PHP 7.3.0 Alpha 2 please visit thedownload page.\n", + " Windows sources and binaries can be found onwindows.php.net/qa/.Please carefully test this version and report any issues found in thebug reporting system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.The next release would be Alpha 3, planned for July 5.The signatures for the release can be found inthe manifestor onthe QA site.Thank you for helping us make PHP better.07 Jun 2018PHP 7.3.0 alpha 1 ReleasedPHP team is glad to announce the release of the first PHP 7.3.0 version, PHP 7.3.0 Alpha 1. \n", + "\t This starts the PHP 7.3 release cycle, the rough outline of which is specified in thePHP Wiki.For source downloads of PHP 7.3.0 Alpha 1 please visit thedownload page.Please carefully test this version and report any issues found in thebug reporting system.Please DO NOT use this version in production, it is an early test version.For more information on the new features and other changes, you can read theNEWSfile,\n", + "\t or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.The next release would be Alpha 2, planned for June 21.The signatures for the release can be found inthe manifestor onthe QA site.Thank you for helping us make PHP better.25 May 2018PHP 7.1.18 ReleasedThe PHP development team announces the immediate availability of PHP\n", + " 7.1.18.\n", + " \n", + " All PHP 7.1 users are encouraged to upgrade to this version.For source downloads of PHP 7.1.18 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.24 May 2018PHP 7.2.6 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.6.\n", + " This is a primarily a bugfix release which includes a memory corruption fix for EXIF.PHP 7.2 users are encouraged to upgrade to this version.For source downloads of PHP 7.2.6 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.26 Apr 2018PHP 5.6.36 ReleasedThe PHP development team announces the immediate availability of PHP\n", + " 5.6.36. This is a security release. Several security bugs have been fixed\n", + " in this release.\n", + "\n", + " All PHP 5.6 users are encouraged to upgrade to this version.For source downloads of PHP 5.6.36 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.26 Apr 2018PHP 7.1.17 ReleasedThe PHP development team announces the immediate availability of PHP\n", + " 7.1.17. This is a security fix release, containing many bugfixes.\n", + " \n", + " All PHP 7.1 users are encouraged to upgrade to this version.For source downloads of PHP 7.1.17 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.26 Apr 2018PHP 7.0.30 ReleasedThe PHP development team announces the immediate availability of PHP\n", + "\t7.0.30. This is a security release. Several security bugs have been fixed\n", + "\tin this release.\n", + " \n", + " All PHP 7.0 users are encouraged to upgrade to this version.For source downloads of PHP 7.0.30 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.01 Feb 2018PHP 7.2.2 ReleasedThe PHP development team announces the immediate availability of PHP\n", + " 7.2.2. This is a bugfix release, with several bug fixes included.All PHP 7.2 users are encouraged to upgrade to this version.For source downloads of PHP 7.2.2 please visit ourdownloads page,\n", + " Windows source and binaries can be found onwindows.php.net/download/.\n", + " The list of changes is recorded in theChangeLog.12 Oct 2017PHP 7.2.0 Release Candidate 4 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 RC4.\n", + " This release is the fourth Release Candidate for 7.2.0.\n", + " All users of PHP are encouraged to test this version carefully, and report any bugs\n", + " and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Release Candidate 4 please visit thedownloadpage,\n", + " Windows sources and binaries can be found atwindows.php.net/qa/.The next Release Candidate will be announced on the 26th of October.\n", + " You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.28 Sep 2017PHP 7.2.0 Release Candidate 3 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 RC3.\n", + " This release is the third Release Candidate for 7.2.0.\n", + " All users of PHP are encouraged to test this version carefully, and report any bugs\n", + " and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Release Candidate 3 please visit thedownloadpage,\n", + " Windows sources and binaries can be found atwindows.php.net/qa/.The next Release Candidate will be announced on the 12th of October.\n", + " You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.31 Aug 2017PHP 7.2.0 Release Candidate 1 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 Release\n", + " Candidate 1. This release is the first Release Candidate for 7.2.0.\n", + " All users of PHP are encouraged to test this version carefully, and report any bugs\n", + " and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Release Candidate 1 please visit thedownloadpage,\n", + " Windows sources and binaries can be found atwindows.php.net/qa/.The second Release Candidate will be released on the 14th of September.\n", + " You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.17 Aug 2017PHP 7.2.0 Beta 3 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 Beta 3.\n", + " This release is the third and final beta for 7.2.0. All users of PHP are encouraged\n", + " to test this version carefully, and report any bugs and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Beta 3 please visit thedownloadpage,\n", + " Windows sources and binaries can be found atwindows.php.net/qa/.The first Release Candidate will be released on the 31th of August.\n", + " You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.06 Jul 2017PHP 7.2.0 Alpha 3 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 Alpha 3.\n", + " This release contains fixes and improvements relative to Alpha 2.\n", + " All users of PHP are encouraged to test this version carefully,\n", + " and report any bugs and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For information on new features and other changes, you can read theNEWSfile,\n", + " or theUPGRADINGfile\n", + " for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Alpha 3 please visit thedownloadpage,\n", + " Windows sources and binaries can be found onwindows.php.net/qa/.The first beta will be released on the 20th of July. You can also read the full list of planned releases on ourwiki.Thank you for helping us make PHP better.Older News EntriesConferences calling for papersphp[world] 2018Upcoming conferencesLaravelConf Taiwan 2018Mid-Atlantic Developer ConferenceInternational PHP Conference 2018 - Fall EditionCoderCruise 2018 - The Bahamas!User Group EventsSpecial ThanksSocial media@official_phpCopyright © 2001-2018 The PHP GroupMy PHP.netContactOther PHP.net sitesMirror sitesPrivacy policy\n" + ] + } + ], + "source": [ + "# Parsing the HTML Content\n", + "\n", + "\n", + "soup = BeautifulSoup(html,\"html5lib\")\n", + "text = soup.get_text(strip=True)\n", + "print(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['PHP:', 'Hypertext', 'PreprocessorDownloadsDocumentationGet', 'InvolvedHelpGetting', 'StartedIntroductionA', 'simple', 'tutorialLanguage', 'ReferenceBasic', 'syntaxTypesVariablesConstantsExpressionsOperatorsControl', 'StructuresFunctionsClasses', 'and', 'ObjectsNamespacesErrorsExceptionsGeneratorsReferences', 'ExplainedPredefined', 'VariablesPredefined', 'ExceptionsPredefined', 'Interfaces', 'and', 'ClassesContext', 'options', 'and', 'parametersSupported', 'Protocols', 'and', 'WrappersSecurityIntroductionGeneral', 'considerationsInstalled', 'as', 'CGI', 'binaryInstalled', 'as', 'an', 'Apache', 'moduleSession', 'SecurityFilesystem', 'SecurityDatabase', 'SecurityError', 'ReportingUsing', 'Register', 'GlobalsUser', 'Submitted', 'DataMagic', 'QuotesHiding', 'PHPKeeping', 'CurrentFeaturesHTTP', 'authentication', 'with', 'PHPCookiesSessionsDealing', 'with', 'XFormsHandling', 'file', 'uploadsUsing', 'remote', 'filesConnection', 'handlingPersistent', 'Database', 'ConnectionsSafe', 'ModeCommand', 'line', 'usageGarbage', 'CollectionDTrace', 'Dynamic', 'TracingFunction', 'ReferenceAffecting', \"PHP's\", 'BehaviourAudio', 'Formats', 'ManipulationAuthentication', 'ServicesCommand', 'Line', 'Specific', 'ExtensionsCompression', 'and', 'Archive', 'ExtensionsCredit', 'Card', 'ProcessingCryptography', 'ExtensionsDatabase', 'ExtensionsDate', 'and', 'Time', 'Related', 'ExtensionsFile', 'System', 'Related', 'ExtensionsHuman', 'Language', 'and', 'Character', 'Encoding', 'SupportImage', 'Processing', 'and', 'GenerationMail', 'Related', 'ExtensionsMathematical', 'ExtensionsNon-Text', 'MIME', 'OutputProcess', 'Control', 'ExtensionsOther', 'Basic', 'ExtensionsOther', 'ServicesSearch', 'Engine', 'ExtensionsServer', 'Specific', 'ExtensionsSession', 'ExtensionsText', 'ProcessingVariable', 'and', 'Type', 'Related', 'ExtensionsWeb', 'ServicesWindows', 'Only', 'ExtensionsXML', 'ManipulationGUI', 'ExtensionsKeyboard', 'Shortcuts?This', 'helpjNext', 'menu', 'itemkPrevious', 'menu', 'itemg', 'pPrevious', 'man', 'pageg', 'nNext', 'man', 'pageGScroll', 'to', 'bottomg', 'gScroll', 'to', 'topg', 'hGoto', 'homepageg', 'sGoto', 'search(current', 'page)/Focus', 'search', 'boxPHP', 'is', 'a', 'popular', 'general-purpose', 'scripting', 'language', 'that', 'is', 'especially', 'suited', 'to', 'web', 'development.Fast,', 'flexible', 'and', 'pragmatic,', 'PHP', 'powers', 'everything', 'from', 'your', 'blog', 'to', 'the', 'most', 'popular', 'websites', 'in', 'the', 'world.Download5.6.36·Release', 'Notes·Upgrading7.0.30·Release', 'Notes·Upgrading7.1.18·Release', 'Notes·Upgrading7.2.7·Release', 'Notes·Upgrading21', 'Jun', '2018PHP', '7.2.7', 'ReleasedPHP', '7.2.7', 'Release', 'AnnouncementThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.7.', 'This', 'is', 'a', 'primarily', 'a', 'bugfix', 'release', 'which', 'includes', 'a', 'segfault', 'fix', 'for', 'opcache.PHP', '7.2', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.2.7', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.21', 'Jun', '2018PHP', '7.3.0', 'alpha', '2', 'ReleasedThe', 'PHP', 'team', 'is', 'glad', 'to', 'announce', 'the', 'release', 'of', 'the', 'second', 'PHP', '7.3.0', 'version,', 'PHP', '7.3.0', 'Alpha', '2.', 'The', 'rough', 'outline', 'of', 'the', 'PHP', '7.3', 'release', 'cycle', 'is', 'specified', 'in', 'thePHP', 'Wiki.For', 'source', 'downloads', 'of', 'PHP', '7.3.0', 'Alpha', '2', 'please', 'visit', 'thedownload', 'page.', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/qa/.Please', 'carefully', 'test', 'this', 'version', 'and', 'report', 'any', 'issues', 'found', 'in', 'thebug', 'reporting', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.The', 'next', 'release', 'would', 'be', 'Alpha', '3,', 'planned', 'for', 'July', '5.The', 'signatures', 'for', 'the', 'release', 'can', 'be', 'found', 'inthe', 'manifestor', 'onthe', 'QA', 'site.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.07', 'Jun', '2018PHP', '7.3.0', 'alpha', '1', 'ReleasedPHP', 'team', 'is', 'glad', 'to', 'announce', 'the', 'release', 'of', 'the', 'first', 'PHP', '7.3.0', 'version,', 'PHP', '7.3.0', 'Alpha', '1.', 'This', 'starts', 'the', 'PHP', '7.3', 'release', 'cycle,', 'the', 'rough', 'outline', 'of', 'which', 'is', 'specified', 'in', 'thePHP', 'Wiki.For', 'source', 'downloads', 'of', 'PHP', '7.3.0', 'Alpha', '1', 'please', 'visit', 'thedownload', 'page.Please', 'carefully', 'test', 'this', 'version', 'and', 'report', 'any', 'issues', 'found', 'in', 'thebug', 'reporting', 'system.Please', 'DO', 'NOT', 'use', 'this', 'version', 'in', 'production,', 'it', 'is', 'an', 'early', 'test', 'version.For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.The', 'next', 'release', 'would', 'be', 'Alpha', '2,', 'planned', 'for', 'June', '21.The', 'signatures', 'for', 'the', 'release', 'can', 'be', 'found', 'inthe', 'manifestor', 'onthe', 'QA', 'site.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.25', 'May', '2018PHP', '7.1.18', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.1.18.', 'All', 'PHP', '7.1', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.1.18', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.24', 'May', '2018PHP', '7.2.6', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.6.', 'This', 'is', 'a', 'primarily', 'a', 'bugfix', 'release', 'which', 'includes', 'a', 'memory', 'corruption', 'fix', 'for', 'EXIF.PHP', '7.2', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.2.6', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.26', 'Apr', '2018PHP', '5.6.36', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '5.6.36.', 'This', 'is', 'a', 'security', 'release.', 'Several', 'security', 'bugs', 'have', 'been', 'fixed', 'in', 'this', 'release.', 'All', 'PHP', '5.6', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '5.6.36', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.26', 'Apr', '2018PHP', '7.1.17', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.1.17.', 'This', 'is', 'a', 'security', 'fix', 'release,', 'containing', 'many', 'bugfixes.', 'All', 'PHP', '7.1', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.1.17', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.26', 'Apr', '2018PHP', '7.0.30', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.0.30.', 'This', 'is', 'a', 'security', 'release.', 'Several', 'security', 'bugs', 'have', 'been', 'fixed', 'in', 'this', 'release.', 'All', 'PHP', '7.0', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.0.30', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.01', 'Feb', '2018PHP', '7.2.2', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.2.', 'This', 'is', 'a', 'bugfix', 'release,', 'with', 'several', 'bug', 'fixes', 'included.All', 'PHP', '7.2', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.2.2', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.12', 'Oct', '2017PHP', '7.2.0', 'Release', 'Candidate', '4', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'RC4.', 'This', 'release', 'is', 'the', 'fourth', 'Release', 'Candidate', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '4', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'will', 'be', 'announced', 'on', 'the', '26th', 'of', 'October.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.28', 'Sep', '2017PHP', '7.2.0', 'Release', 'Candidate', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'RC3.', 'This', 'release', 'is', 'the', 'third', 'Release', 'Candidate', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'will', 'be', 'announced', 'on', 'the', '12th', 'of', 'October.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.31', 'Aug', '2017PHP', '7.2.0', 'Release', 'Candidate', '1', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '1.', 'This', 'release', 'is', 'the', 'first', 'Release', 'Candidate', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '1', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'second', 'Release', 'Candidate', 'will', 'be', 'released', 'on', 'the', '14th', 'of', 'September.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.17', 'Aug', '2017PHP', '7.2.0', 'Beta', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'Beta', '3.', 'This', 'release', 'is', 'the', 'third', 'and', 'final', 'beta', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Beta', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'first', 'Release', 'Candidate', 'will', 'be', 'released', 'on', 'the', '31th', 'of', 'August.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.06', 'Jul', '2017PHP', '7.2.0', 'Alpha', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'Alpha', '3.', 'This', 'release', 'contains', 'fixes', 'and', 'improvements', 'relative', 'to', 'Alpha', '2.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'on', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Alpha', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/qa/.The', 'first', 'beta', 'will', 'be', 'released', 'on', 'the', '20th', 'of', 'July.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'on', 'ourwiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.Older', 'News', 'EntriesConferences', 'calling', 'for', 'papersphp[world]', '2018Upcoming', 'conferencesLaravelConf', 'Taiwan', '2018Mid-Atlantic', 'Developer', 'ConferenceInternational', 'PHP', 'Conference', '2018', '-', 'Fall', 'EditionCoderCruise', '2018', '-', 'The', 'Bahamas!User', 'Group', 'EventsSpecial', 'ThanksSocial', 'media@official_phpCopyright', '©', '2001-2018', 'The', 'PHP', 'GroupMy', 'PHP.netContactOther', 'PHP.net', 'sitesMirror', 'sitesPrivacy', 'policy']\n" + ] + } + ], + "source": [ + "# the main work starts....\n", + "\n", + "\n", + "tokens = [t for t in text.split()]\n", + "print(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PHP::1\n", + "Hypertext:1\n", + "PreprocessorDownloadsDocumentationGet:1\n", + "InvolvedHelpGetting:1\n", + "StartedIntroductionA:1\n", + "simple:1\n", + "tutorialLanguage:1\n", + "ReferenceBasic:1\n", + "syntaxTypesVariablesConstantsExpressionsOperatorsControl:1\n", + "StructuresFunctionsClasses:1\n", + "and:44\n", + "ObjectsNamespacesErrorsExceptionsGeneratorsReferences:1\n", + "ExplainedPredefined:1\n", + "VariablesPredefined:1\n", + "ExceptionsPredefined:1\n", + "Interfaces:1\n", + "ClassesContext:1\n", + "options:1\n", + "parametersSupported:1\n", + "Protocols:1\n", + "WrappersSecurityIntroductionGeneral:1\n", + "considerationsInstalled:1\n", + "as:2\n", + "CGI:1\n", + "binaryInstalled:1\n", + "an:2\n", + "Apache:1\n", + "moduleSession:1\n", + "SecurityFilesystem:1\n", + "SecurityDatabase:1\n", + "SecurityError:1\n", + "ReportingUsing:1\n", + "Register:1\n", + "GlobalsUser:1\n", + "Submitted:1\n", + "DataMagic:1\n", + "QuotesHiding:1\n", + "PHPKeeping:1\n", + "CurrentFeaturesHTTP:1\n", + "authentication:1\n", + "with:3\n", + "PHPCookiesSessionsDealing:1\n", + "XFormsHandling:1\n", + "file:1\n", + "uploadsUsing:1\n", + "remote:1\n", + "filesConnection:1\n", + "handlingPersistent:1\n", + "Database:1\n", + "ConnectionsSafe:1\n", + "ModeCommand:1\n", + "line:1\n", + "usageGarbage:1\n", + "CollectionDTrace:1\n", + "Dynamic:1\n", + "TracingFunction:1\n", + "ReferenceAffecting:1\n", + "PHP's:1\n", + "BehaviourAudio:1\n", + "Formats:1\n", + "ManipulationAuthentication:1\n", + "ServicesCommand:1\n", + "Line:1\n", + "Specific:2\n", + "ExtensionsCompression:1\n", + "Archive:1\n", + "ExtensionsCredit:1\n", + "Card:1\n", + "ProcessingCryptography:1\n", + "ExtensionsDatabase:1\n", + "ExtensionsDate:1\n", + "Time:1\n", + "Related:4\n", + "ExtensionsFile:1\n", + "System:1\n", + "ExtensionsHuman:1\n", + "Language:1\n", + "Character:1\n", + "Encoding:1\n", + "SupportImage:1\n", + "Processing:1\n", + "GenerationMail:1\n", + "ExtensionsMathematical:1\n", + "ExtensionsNon-Text:1\n", + "MIME:1\n", + "OutputProcess:1\n", + "Control:1\n", + "ExtensionsOther:2\n", + "Basic:1\n", + "ServicesSearch:1\n", + "Engine:1\n", + "ExtensionsServer:1\n", + "ExtensionsSession:1\n", + "ExtensionsText:1\n", + "ProcessingVariable:1\n", + "Type:1\n", + "ExtensionsWeb:1\n", + "ServicesWindows:1\n", + "Only:1\n", + "ExtensionsXML:1\n", + "ManipulationGUI:1\n", + "ExtensionsKeyboard:1\n", + "Shortcuts?This:1\n", + "helpjNext:1\n", + "menu:2\n", + "itemkPrevious:1\n", + "itemg:1\n", + "pPrevious:1\n", + "man:2\n", + "pageg:1\n", + "nNext:1\n", + "pageGScroll:1\n", + "to:26\n", + "bottomg:1\n", + "gScroll:1\n", + "topg:1\n", + "hGoto:1\n", + "homepageg:1\n", + "sGoto:1\n", + "search(current:1\n", + "page)/Focus:1\n", + "search:1\n", + "boxPHP:1\n", + "is:24\n", + "a:18\n", + "popular:2\n", + "general-purpose:1\n", + "scripting:1\n", + "language:1\n", + "that:1\n", + "especially:1\n", + "suited:1\n", + "web:1\n", + "development.Fast,:1\n", + "flexible:1\n", + "pragmatic,:1\n", + "PHP:65\n", + "powers:1\n", + "everything:1\n", + "from:1\n", + "your:1\n", + "blog:1\n", + "the:50\n", + "most:1\n", + "websites:1\n", + "in:27\n", + "world.Download5.6.36·Release:1\n", + "Notes·Upgrading7.0.30·Release:1\n", + "Notes·Upgrading7.1.18·Release:1\n", + "Notes·Upgrading7.2.7·Release:1\n", + "Notes·Upgrading21:1\n", + "Jun:3\n", + "2018PHP:9\n", + "7.2.7:3\n", + "ReleasedPHP:2\n", + "Release:15\n", + "AnnouncementThe:1\n", + "development:12\n", + "team:14\n", + "announces:12\n", + "immediate:12\n", + "availability:12\n", + "of:59\n", + "7.2.7.:1\n", + "This:12\n", + "primarily:2\n", + "bugfix:3\n", + "release:22\n", + "which:3\n", + "includes:2\n", + "segfault:1\n", + "fix:3\n", + "for:25\n", + "opcache.PHP:1\n", + "7.2:3\n", + "users:12\n", + "are:12\n", + "encouraged:12\n", + "upgrade:7\n", + "this:17\n", + "version.For:8\n", + "source:21\n", + "downloads:14\n", + "please:14\n", + "visit:14\n", + "ourdownloads:7\n", + "page,:7\n", + "Windows:13\n", + "binaries:13\n", + "can:34\n", + "be:29\n", + "found:24\n", + "onwindows.php.net/download/.:7\n", + "The:10\n", + "list:19\n", + "changes:7\n", + "recorded:7\n", + "theChangeLog.21:1\n", + "7.3.0:8\n", + "alpha:2\n", + "2:2\n", + "ReleasedThe:12\n", + "glad:2\n", + "announce:2\n", + "second:2\n", + "version,:2\n", + "Alpha:10\n", + "2.:2\n", + "rough:2\n", + "outline:2\n", + "7.3:2\n", + "cycle:1\n", + "specified:2\n", + "thePHP:2\n", + "Wiki.For:2\n", + "thedownload:2\n", + "page.:1\n", + "sources:6\n", + "onwindows.php.net/qa/.Please:1\n", + "carefully:2\n", + "test:8\n", + "version:8\n", + "report:7\n", + "any:7\n", + "issues:2\n", + "thebug:7\n", + "reporting:2\n", + "system.THIS:6\n", + "IS:6\n", + "A:6\n", + "DEVELOPMENT:6\n", + "PREVIEW:6\n", + "-:8\n", + "DO:7\n", + "NOT:7\n", + "USE:6\n", + "IT:6\n", + "IN:6\n", + "PRODUCTION!For:6\n", + "more:6\n", + "information:7\n", + "on:13\n", + "new:7\n", + "features:7\n", + "other:7\n", + "changes,:7\n", + "you:14\n", + "read:12\n", + "theNEWSfile,:7\n", + "or:7\n", + "theUPGRADINGfile:7\n", + "complete:7\n", + "upgrading:7\n", + "notes.:7\n", + "These:7\n", + "files:7\n", + "also:12\n", + "archive.The:2\n", + "next:4\n", + "would:2\n", + "3,:1\n", + "planned:7\n", + "July:1\n", + "5.The:1\n", + "signatures:2\n", + "inthe:2\n", + "manifestor:2\n", + "onthe:2\n", + "QA:2\n", + "site.Thank:2\n", + "helping:7\n", + "us:7\n", + "make:7\n", + "better.07:1\n", + "1:4\n", + "first:4\n", + "1.:2\n", + "starts:1\n", + "cycle,:1\n", + "page.Please:1\n", + "system.Please:1\n", + "use:1\n", + "production,:1\n", + "it:1\n", + "early:1\n", + "2,:1\n", + "June:1\n", + "21.The:1\n", + "better.25:1\n", + "May:2\n", + "7.1.18:2\n", + "7.1.18.:1\n", + "All:9\n", + "7.1:2\n", + "theChangeLog.24:1\n", + "7.2.6:2\n", + "7.2.6.:1\n", + "memory:1\n", + "corruption:1\n", + "EXIF.PHP:1\n", + "theChangeLog.26:3\n", + "Apr:3\n", + "5.6.36:2\n", + "5.6.36.:1\n", + "security:5\n", + "release.:4\n", + "Several:2\n", + "bugs:7\n", + "have:2\n", + "been:2\n", + "fixed:2\n", + "5.6:1\n", + "7.1.17:2\n", + "7.1.17.:1\n", + "release,:2\n", + "containing:1\n", + "many:1\n", + "bugfixes.:1\n", + "7.0.30:2\n", + "7.0.30.:1\n", + "7.0:1\n", + "theChangeLog.01:1\n", + "Feb:1\n", + "7.2.2:2\n", + "7.2.2.:1\n", + "several:1\n", + "bug:1\n", + "fixes:2\n", + "included.All:1\n", + "theChangeLog.12:1\n", + "Oct:1\n", + "2017PHP:5\n", + "7.2.0:15\n", + "Candidate:14\n", + "4:2\n", + "RC4.:1\n", + "fourth:1\n", + "7.2.0.:4\n", + "carefully,:5\n", + "incompatibilities:5\n", + "tracking:5\n", + "archive.For:5\n", + "thedownloadpage,:5\n", + "atwindows.php.net/qa/.The:4\n", + "will:5\n", + "announced:2\n", + "26th:1\n", + "October.:2\n", + "You:5\n", + "full:5\n", + "releases:5\n", + "onour:4\n", + "wiki.Thank:4\n", + "better.28:1\n", + "Sep:1\n", + "3:6\n", + "RC3.:1\n", + "third:2\n", + "12th:1\n", + "better.31:1\n", + "Aug:2\n", + "released:3\n", + "14th:1\n", + "September.:1\n", + "better.17:1\n", + "Beta:3\n", + "3.:2\n", + "final:1\n", + "beta:2\n", + "31th:1\n", + "August.:1\n", + "better.06:1\n", + "Jul:1\n", + "contains:1\n", + "improvements:1\n", + "relative:1\n", + "onwindows.php.net/qa/.The:1\n", + "20th:1\n", + "July.:1\n", + "ourwiki.Thank:1\n", + "better.Older:1\n", + "News:1\n", + "EntriesConferences:1\n", + "calling:1\n", + "papersphp[world]:1\n", + "2018Upcoming:1\n", + "conferencesLaravelConf:1\n", + "Taiwan:1\n", + "2018Mid-Atlantic:1\n", + "Developer:1\n", + "ConferenceInternational:1\n", + "Conference:1\n", + "2018:2\n", + "Fall:1\n", + "EditionCoderCruise:1\n", + "Bahamas!User:1\n", + "Group:1\n", + "EventsSpecial:1\n", + "ThanksSocial:1\n", + "media@official_phpCopyright:1\n", + "©:1\n", + "2001-2018:1\n", + "GroupMy:1\n", + "PHP.netContactOther:1\n", + "PHP.net:1\n", + "sitesMirror:1\n", + "sitesPrivacy:1\n", + "policy:1\n" + ] + } + ], + "source": [ + "# NLTK - Natural Language Tool Kit\n", + "\n", + "\n", + "freq = nltk.FreqDist(tokens)\n", + "\n", + "\n", + "# Show the frequency dictionary\n", + "\n", + "\n", + "for key,val in freq.items():\n", + " print (str(key) + ':' + str(val))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stopword list\n", + "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n" + ] + } + ], + "source": [ + "# Remove stopwords - Stopword removal technique\n", + "\n", + "from nltk.corpus import stopwords\n", + "\n", + "\n", + "\n", + "# Copying an array completely\n", + "\n", + "clean_tokens = tokens[:]\n", + "\n", + "\n", + "\n", + "# Using english stopwords\n", + "\n", + "sr = stopwords.words('english')\n", + "print(\"Stopword list\")\n", + "print(sr)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['PHP:', 'Hypertext', 'PreprocessorDownloadsDocumentationGet', 'InvolvedHelpGetting', 'StartedIntroductionA', 'simple', 'tutorialLanguage', 'ReferenceBasic', 'syntaxTypesVariablesConstantsExpressionsOperatorsControl', 'StructuresFunctionsClasses', 'ObjectsNamespacesErrorsExceptionsGeneratorsReferences', 'ExplainedPredefined', 'VariablesPredefined', 'ExceptionsPredefined', 'Interfaces', 'ClassesContext', 'options', 'parametersSupported', 'Protocols', 'WrappersSecurityIntroductionGeneral', 'considerationsInstalled', 'CGI', 'binaryInstalled', 'Apache', 'moduleSession', 'SecurityFilesystem', 'SecurityDatabase', 'SecurityError', 'ReportingUsing', 'Register', 'GlobalsUser', 'Submitted', 'DataMagic', 'QuotesHiding', 'PHPKeeping', 'CurrentFeaturesHTTP', 'authentication', 'PHPCookiesSessionsDealing', 'XFormsHandling', 'file', 'uploadsUsing', 'remote', 'filesConnection', 'handlingPersistent', 'Database', 'ConnectionsSafe', 'ModeCommand', 'line', 'usageGarbage', 'CollectionDTrace', 'Dynamic', 'TracingFunction', 'ReferenceAffecting', \"PHP's\", 'BehaviourAudio', 'Formats', 'ManipulationAuthentication', 'ServicesCommand', 'Line', 'Specific', 'ExtensionsCompression', 'Archive', 'ExtensionsCredit', 'Card', 'ProcessingCryptography', 'ExtensionsDatabase', 'ExtensionsDate', 'Time', 'Related', 'ExtensionsFile', 'System', 'Related', 'ExtensionsHuman', 'Language', 'Character', 'Encoding', 'SupportImage', 'Processing', 'GenerationMail', 'Related', 'ExtensionsMathematical', 'ExtensionsNon-Text', 'MIME', 'OutputProcess', 'Control', 'ExtensionsOther', 'Basic', 'ExtensionsOther', 'ServicesSearch', 'Engine', 'ExtensionsServer', 'Specific', 'ExtensionsSession', 'ExtensionsText', 'ProcessingVariable', 'Type', 'Related', 'ExtensionsWeb', 'ServicesWindows', 'Only', 'ExtensionsXML', 'ManipulationGUI', 'ExtensionsKeyboard', 'Shortcuts?This', 'helpjNext', 'menu', 'itemkPrevious', 'menu', 'itemg', 'pPrevious', 'man', 'pageg', 'nNext', 'man', 'pageGScroll', 'bottomg', 'gScroll', 'topg', 'hGoto', 'homepageg', 'sGoto', 'search(current', 'page)/Focus', 'search', 'boxPHP', 'popular', 'general-purpose', 'scripting', 'language', 'especially', 'suited', 'web', 'development.Fast,', 'flexible', 'pragmatic,', 'PHP', 'powers', 'everything', 'blog', 'popular', 'websites', 'world.Download5.6.36·Release', 'Notes·Upgrading7.0.30·Release', 'Notes·Upgrading7.1.18·Release', 'Notes·Upgrading7.2.7·Release', 'Notes·Upgrading21', 'Jun', '2018PHP', '7.2.7', 'ReleasedPHP', '7.2.7', 'Release', 'AnnouncementThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.7.', 'This', 'primarily', 'bugfix', 'release', 'includes', 'segfault', 'fix', 'opcache.PHP', '7.2', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.2.7', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.21', 'Jun', '2018PHP', '7.3.0', 'alpha', '2', 'ReleasedThe', 'PHP', 'team', 'glad', 'announce', 'release', 'second', 'PHP', '7.3.0', 'version,', 'PHP', '7.3.0', 'Alpha', '2.', 'The', 'rough', 'outline', 'PHP', '7.3', 'release', 'cycle', 'specified', 'thePHP', 'Wiki.For', 'source', 'downloads', 'PHP', '7.3.0', 'Alpha', '2', 'please', 'visit', 'thedownload', 'page.', 'Windows', 'sources', 'binaries', 'found', 'onwindows.php.net/qa/.Please', 'carefully', 'test', 'version', 'report', 'issues', 'found', 'thebug', 'reporting', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.The', 'next', 'release', 'would', 'Alpha', '3,', 'planned', 'July', '5.The', 'signatures', 'release', 'found', 'inthe', 'manifestor', 'onthe', 'QA', 'site.Thank', 'helping', 'us', 'make', 'PHP', 'better.07', 'Jun', '2018PHP', '7.3.0', 'alpha', '1', 'ReleasedPHP', 'team', 'glad', 'announce', 'release', 'first', 'PHP', '7.3.0', 'version,', 'PHP', '7.3.0', 'Alpha', '1.', 'This', 'starts', 'PHP', '7.3', 'release', 'cycle,', 'rough', 'outline', 'specified', 'thePHP', 'Wiki.For', 'source', 'downloads', 'PHP', '7.3.0', 'Alpha', '1', 'please', 'visit', 'thedownload', 'page.Please', 'carefully', 'test', 'version', 'report', 'issues', 'found', 'thebug', 'reporting', 'system.Please', 'DO', 'NOT', 'use', 'version', 'production,', 'early', 'test', 'version.For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.The', 'next', 'release', 'would', 'Alpha', '2,', 'planned', 'June', '21.The', 'signatures', 'release', 'found', 'inthe', 'manifestor', 'onthe', 'QA', 'site.Thank', 'helping', 'us', 'make', 'PHP', 'better.25', 'May', '2018PHP', '7.1.18', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.1.18.', 'All', 'PHP', '7.1', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.1.18', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.24', 'May', '2018PHP', '7.2.6', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.6.', 'This', 'primarily', 'bugfix', 'release', 'includes', 'memory', 'corruption', 'fix', 'EXIF.PHP', '7.2', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.2.6', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.26', 'Apr', '2018PHP', '5.6.36', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '5.6.36.', 'This', 'security', 'release.', 'Several', 'security', 'bugs', 'fixed', 'release.', 'All', 'PHP', '5.6', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '5.6.36', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.26', 'Apr', '2018PHP', '7.1.17', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.1.17.', 'This', 'security', 'fix', 'release,', 'containing', 'many', 'bugfixes.', 'All', 'PHP', '7.1', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.1.17', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.26', 'Apr', '2018PHP', '7.0.30', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.0.30.', 'This', 'security', 'release.', 'Several', 'security', 'bugs', 'fixed', 'release.', 'All', 'PHP', '7.0', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.0.30', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.01', 'Feb', '2018PHP', '7.2.2', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.2.', 'This', 'bugfix', 'release,', 'several', 'bug', 'fixes', 'included.All', 'PHP', '7.2', 'users', 'encouraged', 'upgrade', 'version.For', 'source', 'downloads', 'PHP', '7.2.2', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'binaries', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'changes', 'recorded', 'theChangeLog.12', 'Oct', '2017PHP', '7.2.0', 'Release', 'Candidate', '4', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'RC4.', 'This', 'release', 'fourth', 'Release', 'Candidate', '7.2.0.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Release', 'Candidate', '4', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'announced', '26th', 'October.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'onour', 'wiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.28', 'Sep', '2017PHP', '7.2.0', 'Release', 'Candidate', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'RC3.', 'This', 'release', 'third', 'Release', 'Candidate', '7.2.0.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Release', 'Candidate', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'announced', '12th', 'October.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'onour', 'wiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.31', 'Aug', '2017PHP', '7.2.0', 'Release', 'Candidate', '1', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'Release', 'Candidate', '1.', 'This', 'release', 'first', 'Release', 'Candidate', '7.2.0.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Release', 'Candidate', '1', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'atwindows.php.net/qa/.The', 'second', 'Release', 'Candidate', 'released', '14th', 'September.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'onour', 'wiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.17', 'Aug', '2017PHP', '7.2.0', 'Beta', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'Beta', '3.', 'This', 'release', 'third', 'final', 'beta', '7.2.0.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Beta', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'atwindows.php.net/qa/.The', 'first', 'Release', 'Candidate', 'released', '31th', 'August.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'onour', 'wiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.06', 'Jul', '2017PHP', '7.2.0', 'Alpha', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'immediate', 'availability', 'PHP', '7.2.0', 'Alpha', '3.', 'This', 'release', 'contains', 'fixes', 'improvements', 'relative', 'Alpha', '2.', 'All', 'users', 'PHP', 'encouraged', 'test', 'version', 'carefully,', 'report', 'bugs', 'incompatibilities', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'new', 'features', 'changes,', 'read', 'theNEWSfile,', 'theUPGRADINGfile', 'complete', 'list', 'upgrading', 'notes.', 'These', 'files', 'also', 'found', 'release', 'archive.For', 'source', 'downloads', 'PHP', '7.2.0', 'Alpha', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'binaries', 'found', 'onwindows.php.net/qa/.The', 'first', 'beta', 'released', '20th', 'July.', 'You', 'also', 'read', 'full', 'list', 'planned', 'releases', 'ourwiki.Thank', 'helping', 'us', 'make', 'PHP', 'better.Older', 'News', 'EntriesConferences', 'calling', 'papersphp[world]', '2018Upcoming', 'conferencesLaravelConf', 'Taiwan', '2018Mid-Atlantic', 'Developer', 'ConferenceInternational', 'PHP', 'Conference', '2018', '-', 'Fall', 'EditionCoderCruise', '2018', '-', 'The', 'Bahamas!User', 'Group', 'EventsSpecial', 'ThanksSocial', 'media@official_phpCopyright', '©', '2001-2018', 'The', 'PHP', 'GroupMy', 'PHP.netContactOther', 'PHP.net', 'sitesMirror', 'sitesPrivacy', 'policy']\n" + ] + } + ], + "source": [ + "# Iterate over all the tokens\n", + "for token in tokens:\n", + " if token in stopwords.words('english'):\n", + " clean_tokens.remove(token)\n", + "print(clean_tokens) " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/arunav/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "['Hello Dr. Adam, how are you?', 'I hope everything is going well.', 'Today is a good day, see you dude.']\n", + "['Hello', 'Dr.', 'Adam', ',', 'how', 'are', 'you', '?', 'I', 'hope', 'everything', 'is', 'going', 'well', '.', 'Today', 'is', 'a', 'good', 'day', ',', 'see', 'you', 'dude', '.']\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.download('punkt')\n", + "#Tokenizing sentences\n", + "\n", + "from nltk.tokenize import sent_tokenize, word_tokenize\n", + "mytext = \"Hello Dr. Adam, how are you? I hope everything is going well. Today is a good day, see you dude.\"\n", + "\n", + "# Sentence tokenizer on mytext\n", + "print(sent_tokenize(mytext))\n", + "\n", + "# Word tokenizer on mytext\n", + "print(word_tokenize(mytext))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /home/arunav/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n", + "Working for word PAIN:\n", + "a symptom of some physical hurt or disorder\n", + "['the patient developed severe pain and distension']\n", + "Working for word NLP:\n", + "the branch of information science that deals with natural language information\n", + "*** all definitions of python ***\n", + "large Old World boas\n", + "a soothsaying spirit or a person who is possessed by such a spirit\n", + "(Greek mythology) dragon killed by Apollo at Delphi\n" + ] + } + ], + "source": [ + "# we can get definition from Wordnet\n", + "# for that we have to import wordnet\n", + "nltk.download('wordnet')\n", + "\n", + "from nltk.corpus import wordnet\n", + "\n", + "# Start using wordnet for a particular word\n", + "\n", + "# 1st word is \"pain\"\n", + "syn = wordnet.synsets(\"pain\")\n", + "print(\"Working for word PAIN:\")\n", + "print(syn[0].definition())\n", + "print(syn[0].examples())\n", + "\n", + "#2nd word is \"NLP\"\n", + "print(\"Working for word NLP:\")\n", + "syn = wordnet.synsets(\"NLP\")\n", + "print(syn[0].definition())\n", + "\n", + "#3rd word is \"Python\"\n", + "syn = wordnet.synsets(\"Python\")\n", + "print(\"*** all definitions of python ***\")\n", + "for items in syn:\n", + " print(items.definition())" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['computer', 'computing_machine', 'computing_device', 'data_processor', 'electronic_computer', 'information_processing_system', 'calculator', 'reckoner', 'figurer', 'estimator', 'computer']\n" + ] + } + ], + "source": [ + "# Get synonyms of words\n", + "from nltk.corpus import wordnet\n", + "synonyms = []\n", + "for syn in wordnet.synsets('Computer'):\n", + " for lemma in syn.lemmas():\n", + " synonyms.append(lemma.name())\n", + "print(synonyms)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stone\n", + "speak\n", + "bedroom\n", + "joke\n", + "lisa\n", + "purpl\n", + "univers\n", + "univers\n", + "----------------------\n", + "stone\n", + "speaking\n", + "bedroom\n", + "joke\n", + "lisa\n", + "domicile\n", + "university\n" + ] + } + ], + "source": [ + "# Stemming and Lemmatization\n", + "\n", + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.stem import PorterStemmer\n", + "stemmer = PorterStemmer()\n", + "lemmatizer = WordNetLemmatizer()\n", + "\n", + "print(stemmer.stem('stones'))\n", + "print(stemmer.stem('speaking'))\n", + "print(stemmer.stem('bedroom'))\n", + "print(stemmer.stem('jokes'))\n", + "print(stemmer.stem('lisa'))\n", + "print(stemmer.stem('purple'))\n", + "print(stemmer.stem('university'))\n", + "print(stemmer.stem('universe'))\n", + "print('----------------------')\n", + "print(lemmatizer.lemmatize('stones'))\n", + "print(lemmatizer.lemmatize('speaking'))\n", + "print(lemmatizer.lemmatize('bedroom'))\n", + "print(lemmatizer.lemmatize('jokes'))\n", + "print(lemmatizer.lemmatize('lisa'))\n", + "print(lemmatizer.lemmatize('domicile'))\n", + "print(lemmatizer.lemmatize('university'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}