-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
36 lines (25 loc) · 959 Bytes
/
scrape.py
File metadata and controls
36 lines (25 loc) · 959 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from bs4 import BeautifulSoup
import requests
import pandas as pd
def fetch_data(url, *, timeout=10):
    """Download *url* and collect the list items that follow each ``<h3>``.

    For every ``<h3>`` heading whose immediately following sibling element is
    an ``<ol>`` or ``<ul>``, each direct ``<li>`` child contributes one record.
    Only the first line of the item's text is kept (stored under
    ``'question'``); any remaining text in the item is discarded.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A list of dicts with the keys ``'section'`` (heading text) and
        ``'question'`` (first text line of the list item), in document order.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
        requests.exceptions.RequestException: For timeouts and other
            transport-level failures.
    """
    # Requests applies no timeout by default, so without one a stalled server
    # would block this call indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses rather than scraping an error page and
    # returning a misleadingly empty result.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    content = []
    for sec in soup.select('h3'):
        next_el = sec.find_next_sibling()
        # Headings that are not directly followed by a list carry no items.
        if next_el is None or next_el.name not in ('ol', 'ul'):
            continue
        title = sec.get_text(strip=True)
        # recursive=False: only top-level items, so nested sub-lists do not
        # produce extra records of their own.
        for li in next_el.find_all('li', recursive=False):
            raw_text = li.get_text("\n", strip=True)
            # Text fragments are joined with newlines; the first one is the
            # question line.
            question = raw_text.partition("\n")[0]
            content.append({'section': title, 'question': question})
    return content
def main():
    """Scrape the hard-coded article URL and write the rows to ``qa_data.csv``.

    The records returned by ``fetch_data`` are loaded into a DataFrame, saved
    as CSV without the index column, and a one-line summary is printed.
    """
    source_url = 'https://razorops.com/blog/top-100-ai-ml-interview-questions-and-answers'
    rows = fetch_data(source_url)
    # index=False keeps the DataFrame's positional index out of the file.
    pd.DataFrame(rows).to_csv('qa_data.csv', index=False)
    print(f"Extracted {len(rows)} Q&A pairs from {source_url}")
# Run the scraper only when this file is executed as a script, so importing
# the module does not trigger a network request or write a file.
if __name__ == "__main__":
    main()