# mofangge/splitinfo.py (master branch of DataCluster/mofangge on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import DGStorage as DG;
import urllib.parse;
# Walk every record of one subject collection, pull the first few question
# links out of each record's stored page content, and add one record per link
# to the 'knowledgebase' collection.

SUBJECT = 'biology'         # collection to read; also stored as each new record's "type"
PAGE_SIZE = 20              # number of records requested per fetch() call
LINKS_LINE_INDEX = 38       # index of the content line that holds the '●'-separated links
PAGINATED_PART_COUNT = 31   # this many '●' parts means the list ran past one page
MAX_LINKS_PER_ITEM = 4      # keep only the first few links, otherwise there are too many questions


def _extract_links(links_line):
    """Return a list of (url, title) pairs parsed from one '●'-separated line.

    The text before the first '●' is discarded. When the line splits into
    exactly PAGINATED_PART_COUNT parts, the last part is the pagination
    entry and is dropped as well. At most MAX_LINKS_PER_ITEM links are
    returned. Segments without the expected anchor markup are skipped.
    """
    parts = links_line.split('●')
    entries = parts[1:]
    if len(parts) == PAGINATED_PART_COUNT:
        # More than one page of links: the final part is the pager, not a link.
        entries = entries[:-1]

    links = []
    for entry in entries[:MAX_LINKS_PER_ITEM]:
        anchor = entry.split('</a>')[0]
        href_at = anchor.find('<a href=')
        text_at = anchor.find('.html">')
        if href_at == -1 or text_at == -1:
            # find() returned -1; slicing with it would produce a garbage
            # url/title, so ignore segments that are not well-formed links.
            continue
        # Skip '<a href=' plus the opening quote (9 chars); stop right after '.html'.
        url = anchor[href_at + 9:text_at + 5]
        # NOTE(review): the original end index was find('</a>') - 1 evaluated
        # on text that no longer contains '</a>', i.e. -2, so the last two
        # characters of the link text are dropped. Kept as is -- confirm
        # whether that trimming is intended.
        title = anchor[text_at + 7:-2]
        links.append((url, title))
    return links


source = DG.DGStorage()
source.select(SUBJECT)
target = DG.DGStorage()
target.select('knowledgebase')

# Begin at offset 0 so the first request is not issued with a negative offset.
# The second fetch() argument is presumably the number of records to skip --
# confirm against DGStorage.fetch.
offset = 0
while True:
    batch = source.fetch(PAGE_SIZE, offset)
    if not batch:
        break
    offset += PAGE_SIZE

    for item in batch:
        content_lines = item["content"].split('\n')
        if len(content_lines) <= LINKS_LINE_INDEX:
            # Content is too short to contain the links line; skip this
            # record instead of aborting the whole run with IndexError.
            print('skip ' + item["uid"])
            continue

        for url, title in _extract_links(content_lines[LINKS_LINE_INDEX]):
            target.add(url, '', {
                "content": title,
                "type": SUBJECT,
                "kbname": item["prop"]["name"],
                "kb": item["uid"],
            })
            print('add ' + item["uid"])