-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
71 lines (61 loc) · 2.53 KB
/
utils.py
File metadata and controls
71 lines (61 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import time
import requests
from bs4 import BeautifulSoup
# HTTP headers passed to every requests.get() call in this module; the
# User-Agent string identifies the client as desktop Chrome 117 on Windows 10.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}
def get_review_links(count: int) -> int:
    """Collect review URLs from otzovik.com for each rating from 1 to 5.

    For every rating ``n`` the listing pages ``2 .. count // 20 + 1`` are
    requested, and every ``<a class="review-title">`` link found is appended,
    one URL per line, to ``mark{n}/mark{n}.txt``. The ``mark{n}`` directory is
    created when it does not exist yet.

    Args:
        count: Target number of links per rating. The number of pages is
            derived by dividing it by 20 (presumably the number of reviews
            shown on one listing page).

    Returns:
        ``count`` unchanged, so the call can be chained into ``load_review``.
    """
    blocked_marker = "С Вашего IP-адреса было много обращений к сайту Отзовик."
    # Exclusive upper bound of the page range; identical for every rating.
    page_stop = count // 20 + 2
    for n in range(1, 6):
        folder = f"mark{n}"
        os.makedirs(folder, exist_ok=True)
        links_path = os.path.join(folder, f"mark{n}.txt")
        for i in range(2, page_stop):
            response = requests.get(
                f"https://otzovik.com/reviews/online_fashion_shop_wildberries_ru/{i}/?ratio={n}",
                headers=headers,
                timeout=100,
            )
            soup = BeautifulSoup(response.text, "html.parser")
            if blocked_marker in str(soup):
                # The site is rate-limiting this IP: back off for an hour.
                # NOTE: the blocked page is not retried afterwards.
                print("Problems")
                time.sleep(3600)
            else:
                # Append after every page so progress survives a later crash;
                # the context manager closes the file even if a write fails.
                with open(links_path, "a", encoding="utf-8") as f:
                    for link in soup.find_all("a", class_="review-title"):
                        href = link.get("href")
                        if not href:
                            continue
                        print(href)
                        # The newline separator is required: load_review()
                        # splits the file contents on whitespace.
                        f.write(f"https://otzovik.com{href}\n")
                print(i)
            # Pause between requests to stay below the site's rate limit.
            time.sleep(45)
    return count
def load_review(count: int, start: int = 0) -> None:
    """Download review texts for the links stored in ``mark{n}/mark{n}.txt``.

    For every rating ``n`` from 1 to 5 the whitespace-separated URLs in
    ``mark{n}/mark{n}.txt`` are read, and the links with indices
    ``start .. count - 1`` are fetched. The text of the
    ``<div itemprop="description">`` element of each page is written to
    ``dataset/{n}/{k:04}.txt``, where ``k`` is the index of the link.

    Args:
        count: Exclusive upper bound of the link indices to process. It is
            capped at the number of links actually present in the file.
        start: Index of the first link to process.

    Raises:
        FileNotFoundError: If a ``mark{n}/mark{n}.txt`` file does not exist.
    """
    blocked_marker = "С Вашего IP-адреса было много обращений к сайту Отзовик."
    for n in range(1, 6):
        with open(os.path.join(f"mark{n}", f"mark{n}.txt"), "r", encoding="utf-8") as f:
            links = f.read().split()
        out_dir = os.path.join("dataset", f"{n}")
        # makedirs also creates the parent "dataset" directory when missing.
        os.makedirs(out_dir, exist_ok=True)
        # Never index past the links that were actually collected.
        for k in range(start, min(count, len(links))):
            url = links[k]
            response = requests.get(url, headers=headers, timeout=100)
            soup = BeautifulSoup(response.text, "html.parser")
            if blocked_marker in str(soup):
                # The site is rate-limiting this IP: back off for an hour.
                # NOTE: the blocked review is not retried afterwards.
                print("Problems")
                time.sleep(3600)
            else:
                print(f"Обработка:{url} номер:{k}")
                description = soup.find("div", itemprop="description")
                if description is None:
                    # Page without a review body (removed review, changed
                    # layout, ...): skip it instead of crashing on `.text`.
                    print(f"No description found: {url}")
                else:
                    text = description.text
                    print(text)
                    with open(
                        os.path.join(out_dir, f"{k:04}.txt"),
                        "w",
                        encoding="utf-8",
                    ) as file:
                        file.write(text)
            # Pause between requests to stay below the site's rate limit.
            time.sleep(45)
def main() -> None:
    """Run the pipeline: gather review links, then download the reviews.

    ``get_review_links`` is called with a target of 1000 and its return value
    is used as the ``count`` argument of ``load_review``.
    """
    link_target = get_review_links(1000)
    load_review(link_target)


# Only start scraping when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()