diff --git a/.gitignore b/.gitignore index 066ed4a..21a6804 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ NLPIR_wrap.cxx _NLPIR.so build/ *.pyc +*.log diff --git a/demo.py b/demo.py index b262654..8f115b8 100644 --- a/demo.py +++ b/demo.py @@ -5,8 +5,8 @@ if __name__ == '__main__': - nlpir_init('.', 'UTF-8') - print nlpir_paragraph_process(r'@ICTCLAS张华平博士 应各位ICTCLAS用户的要求,张华平博士提前发布ICTCLAS2013 版本,为了与以前工作进行大的区隔,并推广NLPIR自然语言处理与信息检索共享平台,从本版本开始,系统名称调整为NLPIR汉语分词系统。') - print - print nlpir_paragraph_process(r'“屌丝”这个嘲讽意味的代词迅速爆红,迎合了大众的心理和趣味。因为你会发现从表面符合屌丝定义的人,到和屌丝属性八竿子打不着的人,都在争相认领这一名号。当人人都在忙着确认自己的屌丝身份,并乐此不疲时,屌丝一词一定与时代的什么特征实现了合拍。“屌丝”不是阿Q,他们公然比惨并乐在其中有评论认为,“屌丝”是新时代的阿Q,两者并不完全相同。首先,阿Q是文学巨匠鲁迅一己之力创造的,而“屌丝”则是网络群体狂欢的结果,它是真正由网民集体创作的形象;另外,阿Q最重要的特征是“精神胜利法”,梦想的是“银盔银甲”,意淫的是“我手持钢鞭将你打”。', True) + nlpir_init('.', 'UTF-8') + print nlpir_paragraph_process(u'@ICTCLAS张华平博士 应各位ICTCLAS用户的要求,张华平博士提前发布ICTCLAS2013 版本,为了与以前工作进行大的区隔,并推广NLPIR自然语言处理与信息检索共享平台,从本版本开始,系统名称调整为NLPIR汉语分词系统。'.encode('gbk')) + print + print nlpir_paragraph_process(u'“屌丝”这个嘲讽意味的代词迅速爆红,迎合了大众的心理和趣味。因为你会发现从表面符合屌丝定义的人,到和屌丝属性八竿子打不着的人,都在争相认领这一名号。当人人都在忙着确认自己的屌丝身份,并乐此不疲时,屌丝一词一定与时代的什么特征实现了合拍。“屌丝”不是阿Q,他们公然比惨并乐在其中有评论认为,“屌丝”是新时代的阿Q,两者并不完全相同。首先,阿Q是文学巨匠鲁迅一己之力创造的,而“屌丝”则是网络群体狂欢的结果,它是真正由网民集体创作的形象;另外,阿Q最重要的特征是“精神胜利法”,梦想的是“银盔银甲”,意淫的是“我手持钢鞭将你打”。'.encode('gbk'), True) nlpir_exit() diff --git a/demo_linux_utf8.py b/demo_linux_utf8.py new file mode 100644 index 0000000..a0c2aef --- /dev/null +++ b/demo_linux_utf8.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*-coding: utf-8 -*- + +from PyNLPIR import * +import sys +import locale + +reload(sys) +sys.setdefaultencoding('UTF-8') + +def p(f): + print '%s.%s(): %s' % (f.__module__, f.__name__, f()) + +# Show whether this interpreter build uses UCS-2 or UCS-4 +print sys.maxunicode + +# Check the encoding of the standard output stream +print sys.stdout.encoding + +# Return the default character encoding currently in use +p(sys.getdefaultencoding) + +# Return the encoding used to convert Unicode file names into system file names +p(sys.getfilesystemencoding) + +# Get the default locale settings as a tuple (language, encoding) +p(locale.getdefaultlocale) + +# Return the user's preferred encoding for text data +# The documentation notes that this function only 
returns a guess +p(locale.getpreferredencoding) + +if __name__ == '__main__': + + nlpir_init('.', 'UTF-8') + + firstTest = nlpir_paragraph_process(u'你好中国,我亲爱的祖国!GBK, GB2312, GB18030是中文的三种字符集,UCS是万国字符集!'.encode('gbk')) + print type(firstTest) + print repr(firstTest) + print firstTest + + print + + secondTest = nlpir_paragraph_process(u'编码真是但疼啊!'.encode('gbk'), True) + print type(secondTest) + print repr(secondTest) + print secondTest + + nlpir_exit() diff --git a/nlpir-ctypes/README.md b/nlpir-ctypes/README.md new file mode 100644 index 0000000..9286dd1 --- /dev/null +++ b/nlpir-ctypes/README.md @@ -0,0 +1,23 @@ +nlpir-ctypes +============ + +### Idea +This approach is inspired by [Python下调用NLPIR(ICTCLAS2013)的ctype做法](http://ictclas.nlpir.org/newsDetail?DocId=382), which demonstrates the technique on the Win32 platform. + +### How to hack +I also made it run on the Linux32 platform. As new versions are released, the function symbols may need to be updated to match the newer `libNLPIR.so`/`NLPIR.dll`. + +To adapt it, follow these steps: + +* replace the library path in `nlpir = CDLL(the_library_path)` in `nlpir-ctypes.py` +* use a suitable tool to dump the exported NLPIR function symbols (on Linux, `nm` or `objdump`; on Windows, `dumpbin` or [DLL Exp](http://www.nirsoft.net/utils/dll_export_viewer.html)) +* update the function symbols passed to the `getattr` calls +* check the function parameters and correct the `fillprototype` calls if necessary +* you are done :-) + +### What's next +As you can see, it should not be difficult to add linux64 and win64 support :-) + +**Note** + +Use a Python interpreter that matches the library (a Linux build for `*.so`, a Windows build for `*.dll`), or `CDLL` will fail. 
diff --git a/nlpir-ctypes/linux32/nlpir-ctypes.py b/nlpir-ctypes/linux32/nlpir-ctypes.py
new file mode 100644
index 0000000..600fa38
--- /dev/null
+++ b/nlpir-ctypes/linux32/nlpir-ctypes.py
@@ -0,0 +1,111 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Demo: drive the NLPIR segmenter on 32-bit Linux through ctypes.
+
+Loads libNLPIR.so, binds its exported (C++-mangled) symbols, declares their
+prototypes, then segments one sample sentence and prints the result.
+"""
+
+from ctypes import *
+import codecs
+import sys
+
+# Python 2 only: setdefaultencoding is not reachable until sys is reloaded.
+reload(sys)
+sys.setdefaultencoding('UTF-8')
+print sys.getdefaultencoding()
+
+nlpir = CDLL('../../linux32/libNLPIR.so')
+
+print type(nlpir)
+print nlpir
+
+
+
+def fillprototype(f, restype, argtypes):
+    """Set the ctypes return type and argument types of foreign function f."""
+    f.restype = restype
+    f.argtypes = argtypes
+
+# The library exports C++ symbols, so each function is looked up by its
+# mangled name.
+# NOTE(review): the FileProcess and GetFileNewWords symbols below demangle to
+# CNLPIR:: class members, not free functions; calling them without an object
+# pointer looks unsafe -- confirm against the library's export table.
+MY_NLPIR_Init = getattr(nlpir, '_Z10NLPIR_InitPKciS0_')
+MY_NLPIR_Exit = getattr(nlpir, '_Z10NLPIR_Exitv')
+MY_NLPIR_ParagraphProcess = getattr(nlpir, '_Z22NLPIR_ParagraphProcessPKci')
+MY_NLPIR_ImportUserDict = getattr(nlpir, '_Z20NLPIR_ImportUserDictPKc')
+MY_NLPIR_FileProcess = getattr(nlpir, '_ZN6CNLPIR11FileProcessEPKcS1_i')
+MY_NLPIR_AddUserWord = getattr(nlpir, '_Z17NLPIR_AddUserWordPKc')
+MY_NLPIR_SaveTheUsrDic = getattr(nlpir, '_Z19NLPIR_SaveTheUsrDicv')
+MY_NLPIR_DelUsrWord = getattr(nlpir, '_Z16NLPIR_DelUsrWordPKc')
+MY_NLPIR_GetKeyWords = getattr(nlpir, '_Z17NLPIR_GetKeyWordsPKcib')
+MY_NLPIR_GetFileKeyWords = getattr(nlpir, '_Z21NLPIR_GetFileKeyWordsPKcib')
+MY_NLPIR_GetNewWords = getattr(nlpir, '_Z17NLPIR_GetNewWordsPKcib')
+MY_NLPIR_GetFileNewWords = getattr(nlpir, '_ZN6CNLPIR15GetFileNewWordsEPKcib')
+MY_NLPIR_SetPOSmap = getattr(nlpir, '_Z15NLPIR_SetPOSmapi')
+MY_NLPIR_FingerPrint = getattr(nlpir, '_Z17NLPIR_FingerPrintPKc')
+# New Word Identification
+MY_NLPIR_NWI_Start = getattr(nlpir, '_Z15NLPIR_NWI_Startv')
+MY_NLPIR_NWI_AddFile = getattr(nlpir, '_Z17NLPIR_NWI_AddFilePKc')
+MY_NLPIR_NWI_AddMem = getattr(nlpir, '_Z16NLPIR_NWI_AddMemPKc')
+MY_NLPIR_NWI_Complete = getattr(nlpir, '_Z18NLPIR_NWI_Completev')
+MY_NLPIR_NWI_GetResult = getattr(nlpir, '_Z19NLPIR_NWI_GetResultb')
+MY_NLPIR_NWI_Result2UserDict = getattr(nlpir, '_Z25NLPIR_NWI_Result2UserDictv')
+
+# The Init symbol's parameter list "PKciS0_" is (const char *, int,
+# const char *), so three arguments are declared here.
+fillprototype(MY_NLPIR_Init, c_bool, [c_char_p, c_int, c_char_p])
+fillprototype(MY_NLPIR_Exit, c_bool, None)
+fillprototype(MY_NLPIR_ParagraphProcess, c_char_p, [c_char_p, c_int])
+fillprototype(MY_NLPIR_ImportUserDict, c_uint, [c_char_p])
+fillprototype(MY_NLPIR_FileProcess, c_double, [c_char_p, c_char_p, c_int])
+fillprototype(MY_NLPIR_AddUserWord, c_int, [c_char_p])
+fillprototype(MY_NLPIR_SaveTheUsrDic, c_int, None)
+fillprototype(MY_NLPIR_DelUsrWord, c_int, [c_char_p])
+fillprototype(MY_NLPIR_GetKeyWords, c_char_p, [c_char_p, c_int, c_bool])
+fillprototype(MY_NLPIR_GetFileKeyWords, c_char_p, [c_char_p, c_int, c_bool])
+fillprototype(MY_NLPIR_GetNewWords, c_char_p, [c_char_p, c_int, c_bool])
+fillprototype(MY_NLPIR_GetFileNewWords, c_char_p, [c_char_p, c_int, c_bool])
+fillprototype(MY_NLPIR_SetPOSmap, c_int, [c_int])
+fillprototype(MY_NLPIR_FingerPrint, c_ulong, [c_char_p])
+# New Word Identification
+fillprototype(MY_NLPIR_NWI_Start, c_bool, None)
+fillprototype(MY_NLPIR_NWI_AddFile, c_bool, [c_char_p])
+fillprototype(MY_NLPIR_NWI_AddMem, c_bool, [c_char_p])
+fillprototype(MY_NLPIR_NWI_Complete, c_bool, None)
+# The GetResult symbol ends in "b": its single parameter is a bool.
+fillprototype(MY_NLPIR_NWI_GetResult, c_char_p, [c_bool])
+fillprototype(MY_NLPIR_NWI_Result2UserDict, c_uint, None)
+
+look_gb = codecs.lookup('gb2312')
+look_utf = codecs.lookup('utf-8')
+
+# C++ default arguments do not exist at the ctypes level, so the third
+# argument is passed explicitly as NULL (None).
+if not MY_NLPIR_Init('../../', 1, None):
+    print 'NLPIR Initial failed!'
+    sys.exit(1)
+
+sentence = u"我爱我的祖国,亲爱的祖国!"
+print type(sentence)
+print sentence
+
+# NOTE(review): Init was given encoding code 1, the input is GB2312-encoded
+# and the output is decoded as UTF-8 -- confirm this pairing against NLPIR.h.
+result = MY_NLPIR_ParagraphProcess(sentence.encode('gb2312'), c_int(1))
+print result
+
+result_unicode = look_utf.decode(result)[0]
+print result_unicode
+
+result_gb2312 = look_gb.encode(result_unicode)[0]
+print result_gb2312
+
+result_gbk = look_gb.decode(result_gb2312)[0]
+print result_gbk
+
+MY_NLPIR_Exit()
+
+print 'Goodbye!'
diff --git a/nlpir-ctypes/win32/nlpir-ctypes.py b/nlpir-ctypes/win32/nlpir-ctypes.py
new file mode 100644
index 0000000..756d188
--- /dev/null
+++ b/nlpir-ctypes/win32/nlpir-ctypes.py
@@ -0,0 +1,105 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Demo: drive the NLPIR segmenter on 32-bit Windows through ctypes.
+
+Loads NLPIR.dll, binds its exported (C++-mangled) symbols, declares their
+prototypes, then segments one sample sentence and prints the result.
+"""
+
+from ctypes import *
+import codecs
+import sys
+
+# Python 2 only: setdefaultencoding is not reachable until sys is reloaded.
+reload(sys)
+sys.setdefaultencoding('UTF-8')
+print sys.getdefaultencoding()
+
+nlpir = CDLL('../../win32/NLPIR.dll')
+
+print type(nlpir)
+print nlpir
+
+
+
+def fillprototype(f, restype, argtypes):
+    """Set the ctypes return type and argument types of foreign function f."""
+    f.restype = restype
+    f.argtypes = argtypes
+
+# The DLL exports C++ symbols, so each function is looked up by its mangled
+# name on the handle returned by CDLL above.
+MY_NLPIR_Init = getattr(nlpir, '?NLPIR_Init@@YA_NPBDH@Z')
+MY_NLPIR_Exit = getattr(nlpir, '?NLPIR_Exit@@YA_NXZ')
+MY_NLPIR_ParagraphProcess = getattr(nlpir, '?NLPIR_ParagraphProcess@@YAPBDPBDH@Z')
+MY_NLPIR_ImportUserDict = getattr(nlpir, '?NLPIR_ImportUserDict@@YAIPBD@Z')
+MY_NLPIR_FileProcess = getattr(nlpir, '?NLPIR_FileProcess@@YANPBD0H@Z')
+MY_NLPIR_AddUserWord = getattr(nlpir, '?NLPIR_AddUserWord@@YAHPBD@Z')
+MY_NLPIR_SaveTheUsrDic = getattr(nlpir, '?NLPIR_SaveTheUsrDic@@YAHXZ')
+MY_NLPIR_DelUsrWord = getattr(nlpir, '?NLPIR_DelUsrWord@@YAHPBD@Z')
+MY_NLPIR_GetKeyWords = getattr(nlpir, '?NLPIR_GetKeyWords@@YAPBDPBDH_N@Z')
+MY_NLPIR_GetFileKeyWords = getattr(nlpir, '?NLPIR_GetFileKeyWords@@YAPBDPBDH_N@Z')
+MY_NLPIR_GetNewWords = getattr(nlpir, '?NLPIR_GetNewWords@@YAPBDPBDH_N@Z')
+MY_NLPIR_GetFileNewWords = getattr(nlpir, '?NLPIR_GetFileNewWords@@YAPBDPBDH_N@Z')
+MY_NLPIR_SetPOSmap = getattr(nlpir, '?NLPIR_SetPOSmap@@YAHH@Z')
+MY_NLPIR_FingerPrint = getattr(nlpir, '?NLPIR_FingerPrint@@YAKPBD@Z')
+# New Word Identification
+MY_NLPIR_NWI_Start = getattr(nlpir, '?NLPIR_NWI_Start@@YA_NXZ')
+MY_NLPIR_NWI_AddFile = getattr(nlpir, '?NLPIR_NWI_AddFile@@YAHPBD@Z')
+MY_NLPIR_NWI_AddMem = getattr(nlpir, '?NLPIR_NWI_AddMem@@YA_NPBD@Z')
+MY_NLPIR_NWI_Complete = getattr(nlpir, '?NLPIR_NWI_Complete@@YA_NXZ')
+MY_NLPIR_NWI_GetResult = getattr(nlpir, '?NLPIR_NWI_GetResult@@YAPBD_N@Z')
+MY_NLPIR_NWI_Result2UserDict = getattr(nlpir, '?NLPIR_NWI_Result2UserDict@@YAIXZ')
+
+fillprototype(MY_NLPIR_Init, c_bool, [c_char_p, c_int])
+fillprototype(MY_NLPIR_Exit, c_bool, None)
+fillprototype(MY_NLPIR_ParagraphProcess, c_char_p, [c_char_p, c_int])
+fillprototype(MY_NLPIR_ImportUserDict, c_uint, [c_char_p])
+fillprototype(MY_NLPIR_FileProcess, c_double, [c_char_p, c_char_p, c_int])
+fillprototype(MY_NLPIR_AddUserWord, c_int, [c_char_p])
+fillprototype(MY_NLPIR_SaveTheUsrDic, c_int, None)
+fillprototype(MY_NLPIR_DelUsrWord, c_int, [c_char_p])
+fillprototype(MY_NLPIR_GetKeyWords, c_char_p, [c_char_p, c_int, c_bool])
+fillprototype(MY_NLPIR_GetFileKeyWords, c_char_p, [c_char_p, c_int, c_bool])
+fillprototype(MY_NLPIR_GetNewWords, c_char_p, [c_char_p, c_int, c_bool])
+fillprototype(MY_NLPIR_GetFileNewWords, c_char_p, [c_char_p, c_int, c_bool])
+fillprototype(MY_NLPIR_SetPOSmap, c_int, [c_int])
+fillprototype(MY_NLPIR_FingerPrint, c_ulong, [c_char_p])
+# New Word Identification
+fillprototype(MY_NLPIR_NWI_Start, c_bool, None)
+# The AddFile symbol encodes an int return type ("YAH"), not bool.
+fillprototype(MY_NLPIR_NWI_AddFile, c_int, [c_char_p])
+fillprototype(MY_NLPIR_NWI_AddMem, c_bool, [c_char_p])
+fillprototype(MY_NLPIR_NWI_Complete, c_bool, None)
+# The GetResult symbol's parameter code "_N" is bool.
+fillprototype(MY_NLPIR_NWI_GetResult, c_char_p, [c_bool])
+fillprototype(MY_NLPIR_NWI_Result2UserDict, c_uint, None)
+
+look_gb = codecs.lookup('gb2312')
+look_utf = codecs.lookup('utf-8')
+
+if not MY_NLPIR_Init('../../', 1):
+    print 'NLPIR Initial failed!'
+    sys.exit(1)
+
+sentence = u"我爱我的祖国,亲爱的祖国!"
+print type(sentence)
+print sentence
+
+# NOTE(review): Init was given encoding code 1, the input is GB2312-encoded
+# and the output is decoded as UTF-8 -- confirm this pairing against NLPIR.h.
+result = MY_NLPIR_ParagraphProcess(sentence.encode('gb2312'), c_int(1))
+print result
+
+result_unicode = look_utf.decode(result)[0]
+print result_unicode
+
+result_gb2312 = look_gb.encode(result_unicode)[0]
+print result_gb2312
+
+result_gbk = look_gb.decode(result_gb2312)[0]
+print result_gbk
+
+MY_NLPIR_Exit()
+
+print 'Goodbye!'