-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
124 lines (99 loc) · 3.77 KB
/
scraper.py
File metadata and controls
124 lines (99 loc) · 3.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from serpapi import GoogleSearch
import requests
import logging
import sys, os
from bs4 import BeautifulSoup
import re
# from dotenv import load_dotenv
import openai
# load_dotenv()
# HTTP headers passed to the Browserless scrape request made in
# scrape_website_text: disable caching and declare a JSON request body.
headers = {'Cache-Control': 'no-cache', 'Content-Type': 'application/json'}
# Query-string parameters for the same request. The token is read from the
# BROWSERLESS_API_KEY environment variable and is None when that is unset.
params = {'token': os.environ.get("BROWSERLESS_API_KEY")}
# OpenAI API key, read from the OPENAI_API_KEY environment variable
# (None when unset).
openai.api_key = os.environ.get("OPENAI_API_KEY")
def _brand_from_url(url: str):
    """Return a best-effort brand name derived from ``url``, or ``None``."""
    from urllib.parse import urlsplit

    # Preferred form: the word right after "www." ("nike" in www.nike.com).
    www_match = re.search(r'(?<=www\.)\w+', url)
    if www_match:
        return www_match.group(0)

    # Fallback for hosts without "www." (e.g. us.louisvuitton.com): use the
    # label just before the top-level domain. This is only a heuristic; it
    # yields "co" for multi-part suffixes such as ".co.uk".
    host = urlsplit(url).hostname or ""
    labels = [label for label in host.split(".") if label]
    if not labels:
        return None
    return labels[-2] if len(labels) >= 2 else labels[0]


def scrape_website_text(url: str) -> dict:
    """Scrape a product page and return its brand, text and price.

    The page is rendered through the Browserless ``/scrape`` endpoint and the
    text of its ``<body>`` element is read from the JSON response.

    Args:
        url: Address of the product page to scrape.

    Returns:
        A dict with three keys:

        * ``"brand"``: the word following ``www.`` in ``url``; for URLs
          without ``www.`` the host label preceding the top-level domain, or
          ``None`` when no host can be determined.
        * ``"website_text"``: the first 2500 characters of the body text.
        * ``"price"``: the first ``$``-prefixed token found in the body text,
          or ``None`` when the text contains none.

    Raises:
        requests.HTTPError: If Browserless answers with an error status.
        requests.Timeout: If Browserless does not answer within 60 seconds.
        KeyError, IndexError: If the response JSON lacks the expected
            ``data[0].results[0].text`` path.
    """
    targets = {
        "url": url,
        "elements": [
            {"selector": "body"},
        ],
    }
    response = requests.post(
        "https://chrome.browserless.io/scrape",
        params=params,
        headers=headers,
        json=targets,
        # Without a timeout a stalled render would block the caller forever.
        timeout=60,
    )
    # Fail with a clear HTTP error instead of an obscure KeyError further down.
    response.raise_for_status()

    # Main text of the web page (text of the single "body" selector result).
    webpage_text = response.json()['data'][0]['results'][0]['text']

    # Disabled: prompt for an OpenAI-based sustainability keyword extraction.
    # prompt = """
    # You are a sustainability expert. You are given a product description and
    # asked to pick out words that mark the product's overall sustainability.
    # For example, the material the product is made of, the manufacturing
    # process, the packaging, the company that makes it, any certifications
    # the product has, etc.
    # Please pick out the words that mark the product's overall sustainability:
    # {}
    # """.format(webpage_text[0:2500])

    # A page without any "$" token yields price=None rather than crashing.
    price_match = re.search(r"\$[^\s]*", webpage_text)
    price = price_match.group(0) if price_match else None

    # Disabled: the completion call that would consume the prompt above.
    # completion = openai.Completion.create(
    #     model="text-davinci-003",
    #     prompt=prompt,
    #     temperature=0.7,
    #     max_tokens=60,
    #     top_p=1.0,
    #     frequency_penalty=0.0,
    #     presence_penalty=1
    # )

    return {
        "brand": _brand_from_url(url),
        "website_text": webpage_text[:2500],
        "price": price,
    }
def get_imgs(url: str) -> list:
    """Return the last image on a page that carries alt text.

    The page is fetched with a browser-like ``User-Agent`` header and parsed
    with BeautifulSoup's ``html.parser``.

    Args:
        url: Address of the page to inspect.

    Returns:
        ``[alt_text, img_src]`` for the last ``<img>`` tag (in document order)
        that has an ``alt`` attribute; ``img_src`` is ``None`` when that tag
        has no ``src`` attribute. An empty list is returned when no image on
        the page has an ``alt`` attribute.

    Raises:
        requests.Timeout: If the site does not answer within 30 seconds.
    """
    resp = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        # Without a timeout an unresponsive site would block the caller.
        timeout=30,
    )
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Only the last captioned image is needed, so walk the tags backwards and
    # stop at the first one that has an alt attribute.
    for image in reversed(soup.find_all('img')):
        alt_text = image.get("alt")
        if alt_text is not None:
            return [alt_text, image.get("src")]

    # No captioned image at all: return an empty result instead of raising.
    return []
def sustainability_search(query: str, location: str = "Canada") -> dict:
    """Run a Google search through SerpApi and combine the results by index.

    Args:
        query: Search terms sent to Google.
        location: Location passed to the search; defaults to ``"Canada"``.

    Returns:
        A dict mapping a 0-based position to a dict with the keys ``"brand"``,
        ``"img"``, ``"title"``, ``"url"`` (taken from the inline image
        results), ``"price"`` (from the immersive product results) and
        ``"description"`` (from the organic results). The number of entries
        is the length of the shortest of the three result lists, so the dict
        is empty when any of them is absent from the response. A field that
        an individual result does not carry is ``None``.
    """
    search = GoogleSearch({
        "q": query,
        "location": location,
        "api_key": os.environ.get("SERP_API_KEY"),
    })
    results = search.get_dict()

    # Any of these sections can be missing from a response; treat a missing
    # section as empty instead of raising KeyError.
    imgs = results.get('inline_images') or []
    full_product = results.get('immersive_products') or []
    organic_results = results.get('organic_results') or []

    # NOTE(review): the three lists are paired purely by position. Nothing
    # here checks that the i-th image, product and organic result describe
    # the same item -- confirm this pairing is intended.
    formatted_results = {}
    for i, (img, product, organic) in enumerate(
        zip(imgs, full_product, organic_results)
    ):
        formatted_results[i] = {
            "brand": img.get('source_name'),
            "img": img.get('original'),
            "title": img.get('title'),
            "url": img.get('source'),
            "price": product.get('price'),
            "description": organic.get('snippet'),
        }
    return formatted_results
if __name__ == "__main__":
    # Manual smoke tests, intentionally left commented out. To try one,
    # uncomment a sample URL and the call you want to exercise. The calls
    # reach external services and read their API keys from the environment
    # (BROWSERLESS_API_KEY, SERP_API_KEY).
    # sample = "https://us.louisvuitton.com/eng-us/products/thistle-embroidered-wavy-denim-jacket-nvprod4160010v/1AB517"
    # sample = "https://www.nike.com/ca/t/fc-football-tracksuit-wB5QDv/DC9065-010"
    # print(scrape_website_text(sample))
    # print(get_imgs(sample))
    # print(sustainability_search("black square-neck long-sleeve bodysuit"))
    pass