-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
102 lines (79 loc) · 3.08 KB
/
main.py
File metadata and controls
102 lines (79 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import re
import time
from collections import Counter

import requests
from googletrans import Translator
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# --- Browser start-up and navigation to the Opinion section -----------------
# Launch Chrome with default options; webdriver_manager installs a chromedriver
# binary and returns its path for the Service.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    driver.get("https://elpais.com/")
except Exception:
    # The browser process is already running at this point; shut it down
    # before propagating so a failed page load does not leak it.
    driver.quit()
    raise
time.sleep(5)  # fixed pause to let the home page finish rendering

# Best effort: click the cookie-consent button if one is present.
try:
    accept_button = driver.find_element(By.XPATH, "//button[contains(text(), 'Aceptar')]")
    accept_button.click()
    time.sleep(2)
except WebDriverException:
    # Only Selenium failures (element missing, not clickable, ...) are treated
    # as "no popup"; Ctrl-C and SystemExit are no longer swallowed.
    print("No cookie popup found.")

# Best effort: follow the first link whose href contains "/opinion/".
try:
    opinion_link = driver.find_element(By.XPATH, "//a[contains(@href, '/opinion/')]")
    opinion_link.click()
    time.sleep(5)
except WebDriverException:
    print("Opinion section not found.")
# --- Scrape the first five articles: title, body text and lead image --------
# Upper bound (seconds) for each image download so a stalled server cannot
# hang the whole run.
IMAGE_DOWNLOAD_TIMEOUT = 10

# Titles collected here are consumed by the translation step further down.
spanish_titles = []

try:
    articles = driver.find_elements(By.CSS_SELECTOR, "article h2 a")[:5]
    # Read every href before opening any tab, so the loop below never has to
    # touch the listing-page elements again after switching windows.
    article_urls = [link.get_attribute("href") for link in articles]

    os.makedirs("images", exist_ok=True)
    print("\n📰 First 5 Opinion Articles with Content and Images:\n")

    for i, article_url in enumerate(article_urls, 1):
        # Open the article in a second tab and move the driver's focus to it.
        driver.execute_script("window.open(arguments[0]);", article_url)
        driver.switch_to.window(driver.window_handles[1])
        time.sleep(5)

        try:
            title = driver.find_element(By.TAG_NAME, "h1").text
            spanish_titles.append(title)

            paragraphs = driver.find_elements(
                By.CSS_SELECTOR, "div[data-dtm-region='articulo_cuerpo'] p"
            )
            # Fetch each paragraph's text once (every .text access is a
            # WebDriver round-trip) and drop whitespace-only paragraphs.
            paragraph_texts = [p.text for p in paragraphs]
            content = "\n".join(text for text in paragraph_texts if text.strip() != "")

            try:
                img = driver.find_element(By.CSS_SELECTOR, "figure img")
                img_url = img.get_attribute("src")
                response = requests.get(img_url, timeout=IMAGE_DOWNLOAD_TIMEOUT)
                # Do not save an HTTP error page under a .jpg name.
                response.raise_for_status()
                img_filename = f"images/article_{i}.jpg"
                with open(img_filename, "wb") as f:
                    f.write(response.content)
            except (WebDriverException, requests.RequestException, OSError):
                # No <figure><img>, unusable/missing src, failed download, or
                # the file could not be written: report the image as absent.
                img_url = "No image found"

            print(f"Article {i}: {title}")
            print(f"URL: {article_url}")
            print(f"Image: {img_url}")
            print("Content Preview:")
            print(content[:500], "...\n")
        except Exception as e:
            # Per-article boundary: report the failure and move on to the next one.
            print(f"Error reading article {i}: {e}")

        # Close the article tab and return to the listing tab.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(2)
finally:
    # Always shut the browser down, even if scraping aborts unexpectedly.
    driver.quit()
# --- Translate the collected Spanish headlines into English -----------------
print("\n Translated Titles:\n")
translator = Translator()
translated_titles = []

for position, headline in enumerate(spanish_titles, start=1):
    try:
        english = translator.translate(headline, src='es', dest='en').text
        print(f"{position}. {english}")
        translated_titles.append(english)
    except Exception as err:
        # On any failure keep the untranslated headline, so exactly one entry
        # is appended per input title.
        print(f"{position}. Translation failed: {err}")
        translated_titles.append(headline)
# --- Report words occurring more than twice across the translated titles ----
# Tally lower-cased word tokens over all titles. Counter replaces the manual
# dict.get(...) + 1 bookkeeping; it is a dict subclass that keeps first-seen
# key order, so the report below prints in the same order as before.
word_count = Counter(
    word
    for title in translated_titles
    for word in re.findall(r'\b\w+\b', title.lower())
)

print("\n Repeated Words in Translated Titles (appearing more than twice):\n")
for word, count in word_count.items():
    if count > 2:
        print(f"{word}: {count}")