import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from urllib.parse import urljoin, urlparse
from structurify import struct_text
import os
import logging
from collections import deque
from tqdm import tqdm
import time

# Configure logging
logging.basicConfig(
    filename='scraper_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)


class BFSScraper:
    """Breadth-first scraper: crawls pages level by level from base_url up to max_depth."""

    def __init__(self, base_url, max_depth=2, delay=1):
        """
        Initialize the BFS scraper

        Args:
            base_url: The starting URL for scraping
            max_depth: Maximum depth of links to follow (default: 2)
            delay: Time to wait between requests in seconds (default: 1)
        """
        self.base_url = base_url
        self.max_depth = max_depth
        self.delay = delay
        self.visited_urls = set()
        self.queue = deque()
        self.all_corpus = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Ubuntu/22.04'
        }

        # Create necessary directories
        if not os.path.exists('corpus'):
            os.makedirs('corpus')

    def is_valid_url(self, url):
        """Check if URL belongs to the same domain as base_url"""
        base_domain = urlparse(self.base_url).netloc
        url_domain = urlparse(url).netloc
        # Relative URLs have an empty netloc and are treated as same-domain
        return base_domain in url_domain or not url_domain

    def get_page_content(self, url):
        """Fetch and parse page content, returns soup object or None if failed"""
        try:
            # Use the browser-like headers defined in __init__ and avoid hanging on slow servers
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            logging.error(f"Failed to fetch {url}: {str(e)}")
            return None

    def extract_links(self, soup, current_url):
        """Extract all links from the page's main content area only"""
        links = []
        if soup:
            # Look specifically in the main content area
            main_content = soup.find('div', id='pgtype-topic')
            # If main content area is found, extract links only from there
            if main_content:
                for a_tag in main_content.find_all('a', href=True):
                    href = a_tag.get('href')
                    if href and not href.startswith(('#', 'javascript:', 'mailto:')):
                        absolute_url = urljoin(current_url, href)
                        if self.is_valid_url(absolute_url) and absolute_url not in self.visited_urls:
                            link_text = a_tag.get_text(strip=True)
                            links.append((link_text, absolute_url))
            else:
                logging.warning(f"No main content div found on {current_url}")
        return links

    def extract_images(self, soup, current_url):
        """Extract image URLs from the page's main content area only"""
        images = []
        if soup:
            # Look specifically in the main content area
            main_content = soup.find('div', id='pgtype-topic')
            # If main content area is found, extract images only from there
            if main_content:
                for img_tag in main_content.find_all('img', src=True):
                    img_src = img_tag.get('src')
                    absolute_img_url = urljoin(current_url, img_src)
                    images.append(absolute_img_url)
        return images

    def process_page(self, url, depth):
        """Process a single page: extract content, links and images"""
        if url in self.visited_urls:
            return []
        self.visited_urls.add(url)

        soup = self.get_page_content(url)
        if not soup:
            return []

        # Extract main content; don't fall back to the whole page if pgtype-topic is missing
        main_content = soup.find('div', id='pgtype-topic')
        if not main_content:
            logging.warning(f"No main content found on {url}, skipping content extraction")
            # Still return links for the next level if we're not at max depth
            if depth < self.max_depth:
                return self.extract_links(soup, url)
            return []

        # Extract images
        images = self.extract_images(soup, url)

        # Generate a filename based on the URL
        parsed_url = urlparse(url)
        path_parts = parsed_url.path.strip('/').split('/')
        filename = path_parts[-1] if path_parts else 'index'
        if not filename.endswith('.html'):
            filename += '.html'

        # Get title or use formatted URL if title is blank
        title = soup.title.get_text().strip() if soup.title else None
        if not title:
            # Format the URL into a readable title
            path = parsed_url.path.strip('/')
            if path:
                # Extract last part of the path and replace hyphens/underscores with spaces
                title = path.split('/')[-1]
                title = title.replace('-', ' ').replace('_', ' ').replace('.html', '')
                # Capitalize first letter of each word
                title = ' '.join(word.capitalize() for word in title.split())
            else:
                # If no path, use the domain name
                title = parsed_url.netloc

        # Build a filesystem-safe output filename from the title
        op_file_name = title.replace(" ", "_") + ".json"
        op_file_name = op_file_name.replace(":", "_")
        op_file_name = op_file_name.replace("/", "_")

        # Main content was found, so structure it and save it
        save_path = os.path.join('corpus', op_file_name)
        try:
            # Add images to the json and keep a copy in memory
            _json = struct_text(str(main_content), save_path, link=url, return_json=True)
            _json['images'] = images
            self.all_corpus.append(_json)
            # Also save the individual file
            with open(save_path, 'w', encoding='utf-8') as f:
                json.dump(_json, f, ensure_ascii=False, indent=4)
            logging.info(f"Successfully scraped and saved: {url}")
        except Exception as e:
            logging.error(f"Error processing {url}: {str(e)}")

        # Return links for the next level if we're not at max depth
        if depth < self.max_depth:
            return self.extract_links(soup, url)
        return []

    def start_scraping(self):
        """Start the BFS scraping process"""
        # Add the starting URL to the queue with depth 0
        self.queue.append((self.base_url, 0))

        with tqdm(desc="Scraping pages") as pbar:
            while self.queue:
                current_url, depth = self.queue.popleft()
                # Skip if already visited
                if current_url in self.visited_urls:
                    continue
                logging.info(f"Processing URL: {current_url} at depth {depth}")

                # Process the page and get new links
                new_links = self.process_page(current_url, depth)

                # Add new links to the queue
                for text, link in new_links:
                    if link not in self.visited_urls:
                        self.queue.append((link, depth + 1))

                pbar.update(1)
                pbar.set_postfix({"Depth": depth, "Queue": len(self.queue), "Visited": len(self.visited_urls)})

                # Respect the delay between requests
                time.sleep(self.delay)

        # Save all corpus to a single file
        with open('corpus.json', 'w', encoding='utf-8') as f:
            json.dump(self.all_corpus, f, ensure_ascii=False, indent=4)
        logging.info(f"All corpus saved to corpus.json with {len(self.all_corpus)} pages")
        print(f"Scraping completed. Processed {len(self.visited_urls)} URLs. Results saved in 'corpus.json'")

        # Generate a CSV with all links
        self.save_links_to_csv()

    def save_links_to_csv(self):
        """Save all visited links to a CSV file"""
        df = pd.DataFrame({
            'URL': list(self.visited_urls)
        })
        df.to_csv('scraped_links.csv', index=False)
        logging.info(f"Saved {len(df)} links to scraped_links.csv")


if __name__ == "__main__":
    # URL to scrape
    base_url = "https://in.mathworks.com/help/slrealtime/ug/troubleshooting-basics.html"

    # Create scraper with max depth 3
    scraper = BFSScraper(base_url, max_depth=3)

    # Start scraping
    scraper.start_scraping()
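
# A minimal usage sketch (an illustration, not part of the scraper): after a run,
# corpus.json holds one structured entry per scraped page, and each entry carries
# the 'images' list added in process_page; any other keys depend on what
# structurify.struct_text returns.
#
#     with open('corpus.json', encoding='utf-8') as f:
#         corpus = json.load(f)
#     for page in corpus:
#         print(len(page.get('images', [])), "images")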