Skip to content

Commit ebef1f7

Browse files
authored
Merge pull request #13 from WhiteCatFly/master
add summer
2 parents ec49773 + 4d744f6 commit ebef1f7

887 files changed

Lines changed: 100200 additions & 18 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.DS_Store

10 KB
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
普通搜索可翻译出带有平行语料的古文,代码做了加强,勾选suggestion可翻译非平行语料
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import os
2+
import time
3+
import glob
4+
import json
5+
import jieba
6+
import pickle
7+
import gensim.models
8+
import numpy as np
9+
from scipy.linalg import norm
10+
11+
12+
model_file = '/home/zht/桌面/Models/word_model.mod'
13+
model = gensim.models.KeyedVectors.load(model_file)
14+
15+
16+
with open('/home/zht/桌面/Models/paraData.pk', 'rb') as f:
17+
paradatas = pickle.load(f)
18+
paradatas = [data for data in paradatas if len(data['fanyi']) > 2 or len(data['shangxi']) > 2]
19+
20+
def sentence_vector(s):
    """Return the normalised sum of the word vectors of sentence ``s``.

    The sentence is segmented with ``jieba.lcut``; words that are not in the
    module-level ``model`` are ignored.

    Args:
        s: Sentence to embed.

    Returns:
        A 100-element numpy array: the summed word vectors divided by their
        Euclidean norm, or the unnormalised (all-zero) sum when that norm
        is zero, e.g. because no word of ``s`` is in the model.
    """
    v = np.zeros(100)
    for word in jieba.lcut(s):
        if word in model:
            v += model[word]
    # Compute the norm once instead of once for the test and once for the
    # division; a zero norm means there is nothing to normalise.
    length = norm(v)
    return v / length if length else v
30+
31+
def creatKd(data):
    """Build a KD-tree over the sentence vectors of one corpus entry.

    Args:
        data: Mapping whose 'fanyi' and 'shangxi' values are strings
            (presumably the translation and appreciation texts - confirm
            against the code that produces paraData.pk).

    Returns:
        A ``scipy.spatial.KDTree`` built from an (n, 100) array holding one
        ``sentence_vector`` row per whitespace-separated chunk of the
        concatenated texts.
    """
    # KDTree was used here without being imported anywhere in this module,
    # which raised NameError on the first call; import it locally.
    from scipy.spatial import KDTree

    texts = data['fanyi'] + data['shangxi']
    vecs = [sentence_vector(chunk) for chunk in texts.split()]
    # The explicit reshape keeps a 2-D (n, 100) layout even for n == 0.
    return KDTree(np.array(vecs).reshape((len(vecs), 100)))
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/usr/bin/env python
2+
"""Django's command-line utility for administrative tasks."""
3+
import os
4+
import sys
5+
6+
7+
def main():
    """Select the project settings module and run the requested command.

    Raises:
        ImportError: When Django cannot be imported; the original error is
            chained as the cause.
    """
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'poemsearch.settings')
    try:
        from django.core import management
    except ImportError as exc:
        message = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(message) from exc
    management.execute_from_command_line(sys.argv)
18+
19+
20+
if __name__ == '__main__':
21+
main()
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from django.shortcuts import render
4+
from django.views.decorators import csrf
5+
6+
def post_html(request):
    """Render the search form page.

    ``render`` is used rather than ``render_to_response`` because, per the
    original author's note, the server would otherwise not set the
    csrf_token needed by the form.
    """
    template_name = 'post.html'
    return render(request, template_name)
10+
11+
def search_post(request):
    """Render 'post.html', adding placeholder results for POST submissions.

    When the request carries POST data, the context key 'result' is filled
    with a fixed list of string pairs (the same three pairs repeated three
    times); otherwise the template is rendered with an empty context.
    """
    context = {}
    if request.POST:
        pairs = (('1', '2'), ('3', '4'), ('5', '6'))
        # Build a fresh inner list for every row, as the literal did.
        context['result'] = [
            [left, right]
            for _ in range(3)
            for left, right in pairs
        ]
    return render(request, 'post.html', context)
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
"""
2+
Django settings for poemsearch project.
3+
4+
Generated by 'django-admin startproject' using Django 2.2.2.
5+
6+
For more information on this file, see
7+
https://docs.djangoproject.com/en/2.2/topics/settings/
8+
9+
For the full list of settings and their values, see
10+
https://docs.djangoproject.com/en/2.2/ref/settings/
11+
"""
12+
13+
import os
14+
15+
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
16+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17+
18+
19+
# Quick-start development settings - unsuitable for production
20+
# See https://docs.djangoproject.com/en/2.2/howto/deployment/checklist/
21+
22+
# SECURITY WARNING: keep the secret key used in production secret!
# The key is read from the DJANGO_SECRET_KEY environment variable so that a
# deployment does not have to rely on the value committed to the repository;
# the literal below is only the development fallback.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    '69uskgz-r*l8#ks$=ck@hq_z5(jgk=%#gmy9voahr7azl#d$ms',
)

# SECURITY WARNING: don't run with debug turned on in production!
# Debug stays on by default; set DJANGO_DEBUG to 0/false/no/off to disable it.
DEBUG = os.environ.get('DJANGO_DEBUG', 'true').strip().lower() not in (
    '0', 'false', 'no', 'off',
)

# NOTE(review): '*' accepts every Host header, which makes the explicit
# "127.0.0.1" entry redundant - narrow this list before deploying.
ALLOWED_HOSTS = ["127.0.0.1", '*']
29+
30+
31+
# Application definition
32+
33+
INSTALLED_APPS = [
34+
'django.contrib.admin',
35+
'django.contrib.auth',
36+
'django.contrib.contenttypes',
37+
'django.contrib.sessions',
38+
'django.contrib.messages',
39+
'django.contrib.staticfiles',
40+
]
41+
42+
MIDDLEWARE = [
43+
'django.middleware.security.SecurityMiddleware',
44+
'django.contrib.sessions.middleware.SessionMiddleware',
45+
'django.middleware.common.CommonMiddleware',
46+
'django.middleware.csrf.CsrfViewMiddleware',
47+
'django.contrib.auth.middleware.AuthenticationMiddleware',
48+
'django.contrib.messages.middleware.MessageMiddleware',
49+
'django.middleware.clickjacking.XFrameOptionsMiddleware',
50+
]
51+
52+
ROOT_URLCONF = 'poemsearch.urls'
53+
54+
TEMPLATES = [
55+
{
56+
'BACKEND': 'django.template.backends.django.DjangoTemplates',
57+
'DIRS': [os.path.join(BASE_DIR, 'poemsearch/templates')],
58+
'APP_DIRS': True,
59+
'OPTIONS': {
60+
'context_processors': [
61+
'django.template.context_processors.debug',
62+
'django.template.context_processors.request',
63+
'django.contrib.auth.context_processors.auth',
64+
'django.contrib.messages.context_processors.messages',
65+
],
66+
},
67+
},
68+
]
69+
70+
WSGI_APPLICATION = 'poemsearch.wsgi.application'
71+
72+
73+
# Database
74+
# https://docs.djangoproject.com/en/2.2/ref/settings/#databases
75+
76+
DATABASES = {
77+
'default': {
78+
'ENGINE': 'django.db.backends.sqlite3',
79+
'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
80+
}
81+
}
82+
83+
84+
# Password validation
85+
# https://docs.djangoproject.com/en/2.2/ref/settings/#auth-password-validators
86+
87+
AUTH_PASSWORD_VALIDATORS = [
88+
{
89+
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
90+
},
91+
{
92+
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
93+
},
94+
{
95+
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
96+
},
97+
{
98+
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
99+
},
100+
]
101+
102+
103+
# Internationalization
104+
# https://docs.djangoproject.com/en/2.2/topics/i18n/
105+
106+
LANGUAGE_CODE = 'en-us'
107+
108+
TIME_ZONE = 'UTC'
109+
110+
USE_I18N = True
111+
112+
USE_L10N = True
113+
114+
USE_TZ = True
115+
116+
117+
# Static files (CSS, JavaScript, Images)
118+
# https://docs.djangoproject.com/en/2.2/howto/static-files/
119+
120+
STATIC_URL = '/static/'
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import os
2+
import time
3+
import glob
4+
import json
5+
import jieba
6+
import pickle
7+
import gensim.models
8+
import numpy as np
9+
from scipy.linalg import norm
10+
from sklearn.svm import SVC
11+
12+
model_file = '/home/zht/桌面/Models/word_model.mod'
13+
model = gensim.models.KeyedVectors.load(model_file)
14+
15+
singleword_file = '/home/zht/桌面/poemsearch/poemsearch/singleword.model'
16+
singleword_model = gensim.models.KeyedVectors.load(singleword_file)
17+
18+
paradatas = []
19+
nonparadatas = []
20+
21+
paradatas_vecs = []
22+
nonparadatas_vecs = []
23+
24+
#Load data from cache
25+
with open('/home/zht/桌面/poemsearch/finalKdtree.pk', 'rb') as f:
26+
paradatas = pickle.load(f)
27+
28+
with open('/home/zht/桌面/poemsearch/poemsearch/paradatas_vecs.pk','rb') as f:
29+
paradatas_vecs = pickle.load(f)
30+
31+
with open('/home/zht/桌面/Models/nonparaData.pk','rb') as f:
32+
nonparadatas = pickle.load(f)
33+
34+
with open('/home/zht/桌面/poemsearch/poemsearch/nonparadatas_vecs.pk','rb') as f:
35+
nonparadatas_vecs = pickle.load(f)
36+
37+
38+
def sentence_vector(s):
    """Return the normalised sum of the word vectors of sentence ``s``.

    The sentence is segmented with ``jieba.lcut``; words that are not in the
    module-level ``model`` are ignored.

    Args:
        s: Sentence to embed.

    Returns:
        A 100-element numpy array: the summed word vectors divided by their
        Euclidean norm, or the unnormalised (all-zero) sum when that norm
        is zero, e.g. because no word of ``s`` is in the model.
    """
    v = np.zeros(100)
    for word in jieba.lcut(s):
        if word in model:
            v += model[word]
    # Compute the norm once instead of once for the test and once for the
    # division; a zero norm means there is nothing to normalise.
    length = norm(v)
    return v / length if length else v
46+
47+
48+
49+
def search(target = '夏天的荷花'):
    """Score every entry of the parallel corpus against a query sentence.

    Args:
        target: Query sentence; it is embedded with ``sentence_vector``.

    Returns:
        A list of ``(index, score)`` tuples sorted by ascending score, where
        ``index`` is the position in ``paradatas`` and ``score`` is the sum
        of the distances returned by a 3-neighbour query on that entry's
        KD-tree.
    """
    query_vec = sentence_vector(target)

    def nearest_distance_sum(tree, k=3):
        """Sum the distances from the query vector to its k nearest points."""
        distances, _ = tree.query(query_vec, k)
        return sum(distances)

    ranked = []
    for position, poem in enumerate(paradatas):
        ranked.append((position, nearest_distance_sum(poem['kd'])))
    ranked.sort(key=lambda pair: pair[1])
    return ranked
58+
59+
60+
def show(n, indexs):
    """Return ``(name, content)`` for the first ``n`` ranked corpus entries.

    Args:
        n: Number of results wanted. Values outside ``0 <= n < 10`` fall
            back to 10, the same rule ``svmSearch`` applies. Previously a
            negative ``n`` was used directly as a slice bound and returned
            almost the whole ranking.
        indexs: Sequence of ``(index, score)`` pairs, e.g. the result of
            ``search``; ``index`` addresses ``paradatas``.

    Returns:
        A list of ``(name, content)`` tuples in the order given by ``indexs``.
    """
    limit = n if 0 <= n < 10 else 10
    return [
        (paradatas[index]['name'], paradatas[index]['content'])
        for index, _score in indexs[:limit]
    ]
66+
67+
def svmSearch(n,indexs):
    """Rank the non-parallel corpus with an SVM trained on a search ranking.

    The first 3 entries of ``indexs`` are used as positive examples and the
    last 20 as negative examples; an ``SVC`` fitted on their vectors then
    scores every vector in ``nonparadatas_vecs``.

    Args:
        n: Number of results wanted; values outside ``0 <= n < 10`` fall
            back to 10.
        indexs: Sequence of ``(index, score)`` pairs ordered best first,
            e.g. the result of ``search``; ``index`` addresses
            ``paradatas_vecs``.

    Returns:
        A list of ``(name, content)`` tuples taken from ``nonparadatas``,
        ordered by ascending probability of the negative class.
    """
    positives = indexs[:3]
    # Take the negatives from what is left after the positives so the two
    # groups never overlap, and size the labels from the real sample counts.
    # The labels used to be a fixed 3 + 20 list, which made fit() fail on a
    # length mismatch whenever fewer than 23 ranked entries were supplied.
    negatives = indexs[3:][-20:]
    samples = [paradatas_vecs[index] for index, _ in positives]
    samples += [paradatas_vecs[index] for index, _ in negatives]
    tags = [1] * len(positives) + [0] * len(negatives)

    clf = SVC(gamma='auto', probability=True)
    clf.fit(samples, tags)

    # predict_proba columns follow the sorted class labels, so column 0 is
    # the probability of label 0 (negative); the lowest values come first.
    probabilities = clf.predict_proba(nonparadatas_vecs)
    ranked = sorted(
        ((position, row[0]) for position, row in enumerate(probabilities)),
        key=lambda pair: pair[1],
    )

    limit = n if 0 <= n < 10 else 10
    return [
        (nonparadatas[position]['name'], nonparadatas[position]['content'])
        for position, _ in ranked[:limit]
    ]
82+
83+
def findFirstMoreVal(array, key):
    """Return the index of the first element of ``array`` greater than ``key``.

    Args:
        array: Sequence sorted in non-decreasing order.
        key: Value to compare the elements against.

    Returns:
        The smallest index ``i`` with ``array[i] > key``, or ``len(array)``
        when no element is greater than ``key``.
    """
    # The previous hand-written search narrowed the half-open range [lo, hi)
    # with ``hi = mid - 1``, which skipped candidates (for [1, 2, 3] and
    # key 1 it returned 0 instead of 1). bisect_right implements exactly
    # the documented contract.
    from bisect import bisect_right

    return bisect_right(array, key)
95+
96+
def m3sigma(ps, tail=100):
    """Return the mean minus three standard deviations of the leading scores.

    Args:
        ps: Sequence of entries whose item ``[1]`` is a numeric score, e.g.
            the ``(index, score)`` pairs returned by ``search``.
        tail: Number of trailing entries left out of the statistics.
            Defaults to 100, the value that used to be hard-coded.

    Returns:
        ``mean - 3 * std`` of the kept scores (``np.std`` default, i.e. the
        population standard deviation), or NaN when no entry is kept.
    """
    # ps[:-0] would be empty, so a zero tail has to mean "keep everything".
    kept = ps[:-tail] if tail else ps
    scores = [entry[1] for entry in kept]
    if not scores:
        # numpy would also produce NaN here, but only via runtime warnings.
        return float('nan')
    return np.mean(scores) - 3 * np.std(scores)
101+
102+
103+
104+
105+
106+
def Sentence2VectorBySinglewordModel(sentence):
    """Return a vector representing ``sentence`` using ``singleword_model``.

    All whitespace is removed from the sentence and the vectors of its
    individual characters are summed; characters that are not in
    ``singleword_model`` are ignored.

    Args:
        sentence: String to embed.

    Returns:
        A 100-element numpy array: the summed character vectors divided by
        their Euclidean norm, or the unnormalised (all-zero) sum when that
        norm is zero.
    """
    v = np.zeros(100)
    for char in ''.join(sentence.split()):
        if char in singleword_model:
            v += singleword_model[char]
    # Compute the norm once instead of once for the test and once for the
    # division; a zero norm means there is nothing to normalise.
    length = norm(v)
    return v / length if length else v
118+
119+
120+
121+
def updateKDTree(item, vectors):
    """Rebuild ``item['kd']`` so that it also contains ``vectors``.

    Args:
        item: Mapping whose 'kd' entry is a ``scipy.spatial.KDTree``; the
            entry is replaced in place with a new tree.
        vectors: Iterable of vectors to append as rows to the tree's data.
            It may be empty, in which case the tree is rebuilt from the
            existing data only.

    Returns:
        None. ``item`` is modified in place.
    """
    from scipy.spatial import KDTree

    # Stack everything in one call; stacking inside the loop recopied the
    # whole data matrix for every added vector.
    kddata = np.vstack([item['kd'].data, *vectors])
    item['kd'] = KDTree(kddata)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""poemsearch URL Configuration
2+
3+
The `urlpatterns` list routes URLs to views. For more information please see:
4+
https://docs.djangoproject.com/en/2.2/topics/http/urls/
5+
Examples:
6+
Function views
7+
1. Add an import: from my_app import views
8+
2. Add a URL to urlpatterns: path('', views.home, name='home')
9+
Class-based views
10+
1. Add an import: from other_app.views import Home
11+
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
12+
Including another URLconf
13+
1. Import the include() function: from django.urls import include, path
14+
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
15+
"""
16+
from django.contrib import admin
17+
from django.urls import path
18+
from django.conf.urls import url
19+
from django.views.static import serve
20+
from . import view, search2
21+
22+
urlpatterns = [
23+
path('admin/', admin.site.urls),
24+
# url(r'^hello$', view.hello),
25+
url(r'^get\.html$', view.get_html),
26+
url(r'^get$', view.get),
27+
url(r'^search-post',view.search_post),
28+
url(r'^book/(?P<path>.*)$', serve, {'document_root': '/home/zht/桌面/poemsearch/poemsearch/templates'})
29+
]

0 commit comments

Comments
 (0)