# topic_analysis/scrape.py (alexjyc/topic_analysis, master branch)

from bs4 import BeautifulSoup
import requests
import pandas as pd


def fetch_data(url, timeout=10):
    """Download a page and collect the list items that follow each ``<h3>``.

    For every ``<h3>`` heading whose next sibling element is an ``<ol>`` or
    ``<ul>``, each direct ``<li>`` child contributes one record. Only the
    first text line of the item is kept as the question.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list of dicts with the keys ``'section'`` (heading text) and
        ``'question'`` (first text line of the list item), in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    content = []
    for heading in soup.select('h3'):
        listing = heading.find_next_sibling()
        # Only headings immediately followed by a list carry questions.
        if listing is None or listing.name not in ('ol', 'ul'):
            continue

        title = heading.get_text(strip=True)
        # recursive=False: nested lists inside an item are not separate items.
        for item in listing.find_all('li', recursive=False):
            # Text nodes are joined with newlines; the first line is the question.
            question = item.get_text("\n", strip=True).partition("\n")[0]
            if not question:
                # An <li> with no text would only add a blank row.
                continue
            content.append({'section': title, 'question': question})

    return content

def main():
    """Scrape the configured blog page and save the records to ``qa_data.csv``.

    Calls ``fetch_data`` on the hard-coded URL, writes the resulting records
    to a CSV file in the current working directory, and prints how many
    records were extracted.
    """
    source_url = 'https://razorops.com/blog/top-100-ai-ml-interview-questions-and-answers'
    records = fetch_data(source_url)

    # index=False keeps the pandas row index out of the CSV.
    pd.DataFrame(records).to_csv('qa_data.csv', index=False)

    print(f"Extracted {len(records)} Q&A pairs from {source_url}")

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()