-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTitleCrawl.py
More file actions
33 lines (29 loc) · 1.25 KB
/
TitleCrawl.py
File metadata and controls
33 lines (29 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
# Client for the product-search cluster. Requests use a timeout of 30 and are
# retried up to two times, including when the failure was itself a timeout.
es = Elasticsearch(
    hosts=['http://es.backpackbang.com:9200'],
    retry_on_timeout=True,
    max_retries=2,
    timeout=30,
)

# Lazy iterator over every "amazon" document in the "products" index.
# The `_source` filter asks for only the two fields the export below reads.
cursor = scan(
    es,
    index="products",
    doc_type="amazon",
    query={
        "_source": ["title", "dimensions"],
        "query": {"match_all": {}},
    },
)
# Dump one JSON object per line for every scanned product that has a title and
# a complete set of dimensions (length, width, weight and height all present
# and non-null). Products missing any of these are skipped silently.
with open('data/asin+title+dimension.txt', 'w') as out:
    for seen, hit in enumerate(cursor):
        source = hit["_source"]
        dims = source.get('dimensions')
        title = source.get('title')
        if dims is not None and title is not None:
            has_all_dims = all(
                dims.get(key) is not None
                for key in ('length', 'width', 'weight', 'height')
            )
            if has_all_dims:
                record = {
                    'asin': hit['_id'],  # the document id is used as the ASIN
                    'title': title,
                    'length': dims['length'],
                    'width': dims['width'],
                    'height': dims['height'],
                    'weight': dims['weight'],
                }
                json.dump(record, out)
                out.write("\n")
        # Progress is reported per scanned document (not per written record),
        # every 1000 documents, skipping the very first iteration.
        if seen and seen % 1000 == 0:
            print('done:', seen)