BrowserStack-Assignment/main.py at main · axshiish/BrowserStack-Assignment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import re
import time

import requests
from googletrans import Translator
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Scrape the first five articles linked from the El País opinion section:
# print each article's title, URL, image URL and a content preview, save the
# first <figure> image under images/, and collect the Spanish titles in
# `spanish_titles`.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

spanish_titles = []

# Everything that talks to the browser lives inside try/finally so the Chrome
# and chromedriver processes are always shut down, even when scraping aborts.
try:
    driver.get("https://elpais.com/")
    time.sleep(5)  # fixed pause to let the landing page render

    # The cookie banner is optional: dismiss it when present, carry on if not.
    try:
        accept_button = driver.find_element(By.XPATH, "//button[contains(text(), 'Aceptar')]")
        accept_button.click()
        time.sleep(2)
    except WebDriverException:
        print("No cookie popup found.")

    # Best effort: if the opinion link is missing or not clickable, the
    # script keeps going with whatever page is currently loaded.
    try:
        opinion_link = driver.find_element(By.XPATH, "//a[contains(@href, '/opinion/')]")
        opinion_link.click()
        time.sleep(5)
    except WebDriverException:
        print("Opinion section not found.")

    articles = driver.find_elements(By.CSS_SELECTOR, "article h2 a")[:5]
    # Read every href up front: the element handles can go stale if the
    # listing page re-renders while the article tabs are being visited.
    article_urls = [article.get_attribute("href") for article in articles]

    os.makedirs("images", exist_ok=True)

    print("\n📰 First 5 Opinion Articles with Content and Images:\n")

    for i, article_url in enumerate(article_urls, 1):
        # Open the article in a new tab so the listing tab stays untouched.
        driver.execute_script("window.open(arguments[0]);", article_url)
        # The freshly opened tab is the last handle in the list.
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(5)

        try:
            title = driver.find_element(By.TAG_NAME, "h1").text
            spanish_titles.append(title)

            paragraphs = driver.find_elements(By.CSS_SELECTOR, "div[data-dtm-region='articulo_cuerpo'] p")
            content = "\n".join([p.text for p in paragraphs if p.text.strip() != ""])

            try:
                img = driver.find_element(By.CSS_SELECTOR, "figure img")
                img_url = img.get_attribute("src")
                # A timeout keeps a stalled download from hanging the run, and
                # raise_for_status stops an HTTP error page being saved as .jpg.
                response = requests.get(img_url, timeout=10)
                response.raise_for_status()
                img_filename = f"images/article_{i}.jpg"
                with open(img_filename, "wb") as f:
                    f.write(response.content)
            except (WebDriverException, requests.RequestException, OSError):
                # Missing <img>, missing/invalid src, failed download or
                # failed write: all are reported the same way.
                img_url = "No image found"

            print(f"Article {i}: {title}")
            print(f"URL: {article_url}")
            print(f"Image: {img_url}")
            print("Content Preview:")
            print(content[:500], "...\n")

        except Exception as e:
            # One unreadable article must not stop the remaining ones.
            print(f"Error reading article {i}: {e}")

        # Close the article tab and return to the listing tab.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(2)
finally:
    driver.quit()


def _translate_title(engine, position, headline):
    """Print and return the English translation of one Spanish headline.

    If the translation call (or reading its text) raises, the failure is
    printed instead and the untranslated headline is returned.
    """
    try:
        english = engine.translate(headline, src='es', dest='en').text
        print(f"{position}. {english}")
        return english
    except Exception as error:
        print(f"{position}. Translation failed: {error}")
        return headline


print("\n Translated Titles:\n")
translator = Translator()
translated_titles = []

# One entry per collected title, in the same order as `spanish_titles`.
for position, headline in enumerate(spanish_titles, 1):
    translated_titles.append(_translate_title(translator, position, headline))


# Tally how often each lowercased word occurs across all translated titles.
word_count = {}
for headline in translated_titles:
    for token in re.findall(r'\b\w+\b', headline.lower()):
        if token in word_count:
            word_count[token] += 1
        else:
            word_count[token] = 1

print("\n Repeated Words in Translated Titles (appearing more than twice):\n")
# Keep first-seen order; report only words counted three or more times.
repeated = [(token, total) for token, total in word_count.items() if total > 2]
for token, total in repeated:
    print(f"{token}: {total}")