-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
61 lines (56 loc) · 2.55 KB
/
model.py
File metadata and controls
61 lines (56 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
# coding=utf-8
import requests
from bs4 import BeautifulSoup
from app import app
from utils.langconv import *
def _to_text(value):
    """Return ``value`` as text, decoding UTF-8 bytes and dropping bad bytes.

    ``str(tag)`` on a BeautifulSoup tag can produce a byte string or a text
    string depending on the interpreter, so only decode when bytes arrive.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8', 'ignore')
    return value


def _first_match(pattern, text, default=''):
    """Return the first ``findall`` hit of ``pattern`` in ``text``, else ``default``."""
    matches = pattern.findall(text)
    return matches[0] if matches else default


def get_google_result(keyword, pn):
    """Fetch one page of Google search results and scrape it into a dict.

    Args:
        keyword: Search query, sent as the ``q`` request parameter.
        pn: Page number; sent as ``start = (pn - 1) * 10``, so page 1 maps
            to offset 0.

    Returns:
        A dict with three keys:

        * ``total`` -- the digits/commas captured from the ``resultStats``
          block, as a string.
        * ``took`` -- the digits/dots captured from the ``<nobr>`` element
          of the same block, as a string.
        * ``item`` -- a list of dicts with ``title``, ``link``, ``cite``
          and ``abstract``, one per ``div.rc`` block that has an abstract.

        ``total``, ``took`` and ``cite`` are empty strings when the
        corresponding markup is absent instead of raising ``IndexError``.

    Raises:
        Exceptions raised by ``requests.get`` are not handled here and
        propagate to the caller.
    """
    # No explicit ``import re`` is visible at the top of this module; the
    # name was previously only reachable through the star import from
    # ``utils.langconv``. Import it locally so the function does not depend
    # on that accident.
    import re

    find_result_sum_re = re.compile(r'<div\s*id="resultStats">\D*([\d,]*).*<nobr>')
    find_result_time_re = re.compile(r'<nobr>\D*([\d.]*)\D*</nobr>')
    find_abstract_re = re.compile(r'<span\s+class="st">(.+?)</span></?div')
    find_cite_re = re.compile(r'<cite\s+class="_Rm.*">(.+?)</cite>')

    params = {'q': keyword,
              "start": (pn - 1) * 10,
              'gws_rd': "ssl"}
    response = requests.get(app.config['GOOGLE_SEARCH_URL'],
                            params=params,
                            timeout=10,
                            headers=app.config['USER_AGENT'])
    html = response.content.decode('utf-8', 'ignore')
    soup = BeautifulSoup(html, "lxml")

    # When the stats block is missing, ``soup.find`` returns None and
    # ``str(None)`` simply fails to match, yielding the empty defaults.
    stats_markup = str(soup.find("div", id="resultStats"))
    result_sum = _first_match(find_result_sum_re, stats_markup)
    result_time = _first_match(find_result_time_re, stats_markup)

    items = []
    for block in soup.find_all("div", class_="rc"):
        # Serialise each result block once and reuse it for every pattern.
        markup = str(block)
        abstracts = find_abstract_re.findall(markup)
        if not abstracts:
            # Same filter as before: results without an abstract are skipped.
            continue
        heading = block.h3
        if heading is None or heading.a is None:
            # No title/link to report; skip rather than fail the whole page.
            continue
        items.append({
            'title': Converter('zh-hans').convert(heading.get_text()),
            'link': heading.a.get('href'),
            'cite': _to_text(_first_match(find_cite_re, markup)),
            'abstract': Converter('zh-hans').convert(_to_text(abstracts[0])),
        })

    return {
        'total': result_sum,
        'took': result_time,
        'item': items
    }
def get_zhihu_result(keyword):
    """Fetch up to six Zhihu search hits for ``keyword`` on a best-effort basis.

    Args:
        keyword: Search query, sent as the ``query`` request parameter.

    Returns:
        A list of dicts with ``title``, ``link``, ``thumb``, ``author`` and
        ``like``, built from the first six ``div.result-about-list``
        elements of the response. An empty list is returned when the
        request fails with a connection error or a timeout.
    """
    query = {
        'query': keyword,
        'ie': 'utf-8'
    }
    try:
        response = requests.get(app.config['ZHIHU_SEARCH_URL'],
                                params=query,
                                timeout=5,
                                headers=app.config['USER_AGENT'])
    # The exception classes must be a parenthesised tuple. The old
    # ``except A, B:`` form means "catch A and bind it to B" on Python 2
    # (which overwrote ``requests.ConnectTimeout``) and is a syntax error
    # on Python 3. ``requests.Timeout`` also covers read timeouts triggered
    # by the ``timeout=5`` above.
    except (requests.ConnectionError, requests.Timeout):
        return []

    html = response.content.decode('utf-8', 'ignore')
    entries = BeautifulSoup(html, "lxml").find_all(
        'div', class_='result-about-list')[:6]

    zhihu_result = []
    for entry in entries:
        # Look the thumbnail container up once; fall back to the configured
        # default image when the entry has none.
        thumb_holder = entry.find('span', class_='about-img')
        if thumb_holder is not None:
            thumb = thumb_holder.img.get('src')
        else:
            thumb = app.config['ZHIHU_DEFAULT_THUMBNAIL']

        author_link = entry.p.a
        zhihu_result.append({
            'title': entry.a.text,
            'link': entry.h4.a.get('href'),
            'thumb': thumb,
            'author': u'知乎用户' if author_link is None else author_link.text,
            'like': entry.find('span', class_='count').text,
        })
    return zhihu_result