src/App.jsx (52 additions, 0 deletions)
@@ -5,9 +5,20 @@ import AllApps from "./Python_Library_Pages/AllApps";
import PythonBasics from "./Python_Library_Pages/Python_Basics/Introduction-to-Python";
import NumpyBasics from "./Python_Library_Pages/Numpy/Intro-to-Numpy";
import MatplotlibBasics from "./Python_Library_Pages/Matplotlib/Intro-to-Matplotlib";
import BeautifulSoupBasics from "./Python_Library_Pages/BeautifulSoup/Intro-to-BeautifulSoup";
import GettingStartedBS from "./Python_Library_Pages/BeautifulSoup/Getting-Started-With-BS";
import BasicsBeautifulSoup from "./Python_Library_Pages/BeautifulSoup/Basics-of-BeautifulSoup";
import NavigatingHTMLTree from "./Python_Library_Pages/BeautifulSoup/Navigating-HTML-Tree";
import DataFromWebPages from "./Python_Library_Pages/BeautifulSoup/Extracting-Data-From-WebPage";
import HandleComplexHTML from "./Python_Library_Pages/BeautifulSoup/Handling-Complex-HTML";
import RealWorldExamples from "./Python_Library_Pages/BeautifulSoup/Real-World-Examples";
import AdvanceWebScrapping from "./Python_Library_Pages/BeautifulSoup/Advance-Web-Scrapping";
import OperatorsBasics from "./Python_Library_Pages/Python_Basics/Intrduction-to-Operators";
import FunctionsBasics from "./Python_Library_Pages/Python_Basics/Introduction-to-Functions";
import PandasBasics from "./Python_Library_Pages/Pandas/Intro-to-Pandas";

import PlayGround from "./Python/PlayGround";

import Navbar from "./Components/Navbar";
@@ -49,14 +60,55 @@ const App = () => {
<Route path="Intro-to-Matplotlib" element={<MatplotlibBasics />} />
</Route>

<Route path="BeautifulSoup-Library" element={<Outlet />}>
<Route
path="Intro-to-BeautifulSoup"
element={<BeautifulSoupBasics />}
/>
<Route
path="Getting-Started-With-BS"
element={<GettingStartedBS />}
/>
<Route
path="Basics-of-BeautifulSoup"
element={<BasicsBeautifulSoup />}
/>
<Route
path="Navigating-HTML-Tree"
element={<NavigatingHTMLTree />}
/>
<Route
path="Extracting-Data-From-WebPage"
element={<DataFromWebPages />}
/>
<Route
path="Handling-Complex-HTML"
element={<HandleComplexHTML />}
/>
<Route
path="Advance-Web-Scrapping"
element={<AdvanceWebScrapping />}
/>
<Route
path="Real-World-Examples"
element={<RealWorldExamples />}
/>
</Route>

{/* Seaborn */}
<Route path="/Seaborn" element={<Outlet />}>
<Route path="Introduction-to-seaborn" element={<Seaborn />} />
</Route>


{/* TensorFlow */}
<Route path="/TensorFlow" element={<Outlet />}>
<Route path="Introduction-to-tensorFlow" element={<TensorFlow />} />
<Route path="Installation" element={<Installation />}/>
<Route
path="Introduction-to-Tensors"
element={<Tensors />}
/>
<Route path="Introduction-to-Variables" element={<Variables />} />
</Route>

<Route path="/Flask" element={<Outlet />}>
src/Constants/index.js (43 additions, 0 deletions)
@@ -285,6 +285,46 @@ export const subMenusList = [
],
},
{
name: "BeautifulSoup-Library",
title: "BeautifulSoup Library",
route: "/BeautifulSoup-Library/intro-to-BeautifulSoup",
children: [
{
title: "Intro to BeautifulSoup",
route: "Intro-to-BeautifulSoup",
},
{
title: "Getting Started With BeautifulSoup",
route: "Getting-Started-With-BS",
},
{
title: "Basics of BeautifulSoup",
route: "Basics-of-BeautifulSoup",
},
{
title: "Navigating The HTML Tree",
route: "Navigating-HTML-Tree",
},
{
title: "Extracting Data From Web-Pages",
route: "Extracting-Data-From-WebPage",
},
{
title: "Handling Complex HTML Structures",
route: "Handling-Complex-HTML",
},
{
title: "Real-World Examples and Case Studies",
route: "ReaL-World-Examples",
},
{
title: "Advanced Web Scraping Techniques",
route: "Advance-Web-Scrapping",
}
],
},
{
name: "Pillow (PIL)",
title: "Pillow (PIL)",
route: "/Pillow-(PIL)/Introduction-to-Pillow-(PIL)",
@@ -561,6 +601,9 @@ export const subMenusList = [
route: "Introduction-to-TensorFlow",
},
],
},
{
name: "Tkinter",
src/Python_Library_Pages/BeautifulSoup/Advance-Web-Scrapping.jsx (new file, 187 additions)
@@ -0,0 +1,187 @@
import React from "react";

const AdvanceWebScrapping = () => {
return (
<div>
<h1><strong>Advanced Web Scraping Techniques </strong> </h1> <br />

<p>Advanced web scraping often involves dealing with more complex scenarios, such as handling pagination, interacting with JavaScript-based websites, and avoiding web scraping restrictions. In this section, we will explore advanced web scraping techniques using Beautiful Soup and related tools.</p>
<br />
<h1><strong>Handling Pagination </strong> </h1> <br />
<p>Pagination is common on websites that display data across multiple pages, such as search results or product listings. To scrape data from multiple pages, you need to navigate through each page and extract the desired information. Here's a high-level approach:</p>
<br />

<p>
<strong> Retrieve the First Page:</strong> Fetch the HTML content of the first page and parse it with Beautiful Soup. <br /> <br />

<strong>Extract Data:</strong> Extract the data you need from the first page. <br /> <br />

<strong>Identify Pagination Mechanism:</strong> Find elements or controls that allow you to navigate to the next page (e.g., "Next" buttons or page numbers). <br /> <br />

<strong> Iterate Through Pages:</strong> Use a loop to iterate through the pages by following the pagination mechanism, fetching each page's content, and extracting data.</p> <br /> <br />
<p>Here's an example of scraping search results from a paginated website:</p> <br />

<div className="snippet">
<pre>
<code>
{`import requests
from bs4 import BeautifulSoup

base_url = "https://example.com/search?page="
page_number = 1

while True:
    url = f"{base_url}{page_number}"
    response = requests.get(url)
    if response.status_code != 200:
        break  # Stop if the page is not found or an error occurs

    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract data from the current page
    # ...

    # Find the "Next" button or page number for the next page
    next_button = soup.find('a', class_='next')
    if not next_button:
        break  # No more pages to scrape

    page_number += 1
`}
</code>
</pre>
</div> <br />

<div className="content">
<h1><strong>Handling JavaScript-Driven Websites</strong></h1> <br />
<p>Some websites load content dynamically using JavaScript, making traditional web scraping challenging. In such cases, consider using tools like Selenium in combination with Beautiful Soup. Selenium allows you to automate web interactions and retrieve data from pages that rely on JavaScript to render content.</p> <br />
<br />

<p>Here's a basic example of using Selenium with Beautiful Soup:</p> <br />
<div className="snippet">
<pre>
<code>
{`import time

from selenium import webdriver
from bs4 import BeautifulSoup

# Set up a Selenium webdriver (Selenium 4+ manages the browser driver for you)
driver = webdriver.Chrome()

# Load a webpage with JavaScript content
driver.get('https://example.com/some-page')

# Wait for the page to load completely (you may need to adjust the wait time)
time.sleep(5)

# Get the page source after JavaScript rendering
page_source = driver.page_source

# Parse the page source with Beautiful Soup
soup = BeautifulSoup(page_source, 'html.parser')

# Extract data from the page
# ...

# Don't forget to close the driver when done
driver.quit()
`}
</code>
</pre>
</div> <br />
</div>


<div className="content">
<h1><strong>Avoiding Web Scraping Restrictions</strong></h1> <br />
<p>Some websites actively discourage or block web scraping. To overcome restrictions and avoid being detected as a scraper, consider the following techniques (a short sketch follows the list):</p> <br />

<p>
<strong> Use User Agents:</strong> Set a User-Agent header in your requests to mimic a real browser. <br />
<strong> Limit Request Rate: </strong> Avoid making too many requests in a short period; use delays between requests. <br />

<strong> Rotate IP Addresses:</strong> If possible, use a rotating IP proxy service to prevent IP bans. <br />

<strong> Use Headless Browsing:</strong> Use headless browsers like Selenium with the --headless option to run without a visible browser window. <br />

<strong> Use Request Session: </strong> Utilize the requests library's session feature to persist cookies and maintain a session. <br />

<strong> Handle CAPTCHAs:</strong> If a website uses CAPTCHAs, consider using CAPTCHA-solving services or manual intervention. <br />

</p>
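<p>Here's a minimal sketch combining a few of these ideas with the requests library; the URL, User-Agent string, and delay are placeholders to adapt to your target site:</p> <br />
<div className="snippet">
<pre>
<code>
{`import time

import requests

# A session persists cookies and connection settings across requests
session = requests.Session()
session.headers.update({
    # Mimic a real browser with a User-Agent header (placeholder value)
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
})

urls = [f"https://example.com/page/{i}" for i in range(1, 4)]
for url in urls:
    response = session.get(url, timeout=10)
    print(url, response.status_code)
    time.sleep(2)  # Pause between requests to limit the request rate
`}
</code>
</pre>
</div> <br />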
</div>

<div className="content">
<h1><strong>Dealing with Dynamic Content</strong></h1> <br />
<p>Some websites load content dynamically using AJAX or other techniques. To scrape such content, you can inspect network requests made by the website and simulate those requests in your scraping script. Tools like Browser Developer Tools (e.g., Chrome DevTools) can help you identify the relevant network requests and parameters.</p> <br />

<p>Additionally, libraries like Requests and Selenium allow you to send HTTP requests and handle dynamic content retrieval programmatically.</p>
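<p>For example, if the browser's Network tab shows the page being filled from a JSON endpoint, you can often call that endpoint directly; the endpoint and field names below are hypothetical:</p> <br />
<div className="snippet">
<pre>
<code>
{`import requests

# Hypothetical JSON endpoint discovered in the browser's Network tab
api_url = "https://example.com/api/products?page=1"

response = requests.get(api_url, timeout=10)
response.raise_for_status()

data = response.json()  # Parsed JSON instead of HTML
for item in data.get("products", []):
    print(item.get("name"), item.get("price"))
`}
</code>
</pre>
</div> <br />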
</div>


<div className="content">
<h1><strong>Handling Login and Authentication</strong></h1> <br />
<p>For websites that require user authentication, you can use tools like Selenium to automate login processes. Here's a simplified example:</p> <br />

<div className="snippet">
<pre>
<code>
{`from selenium import webdriver
from selenium.webdriver.common.by import By

# Set up Selenium (Selenium 4+ manages the browser driver for you)
driver = webdriver.Chrome()

# Open the login page
driver.get('https://example.com/login')

# Fill in the login form fields
username_input = driver.find_element(By.NAME, 'username')
password_input = driver.find_element(By.NAME, 'password')
username_input.send_keys('your_username')
password_input.send_keys('your_password')

# Submit the form
login_button = driver.find_element(By.XPATH, '//button[@type="submit"]')
login_button.click()

# Continue scraping authenticated content
# ...

# Don't forget to close the driver when done
driver.quit()
`}
</code>
</pre>
</div> <br />
</div>


<div className="content">
<h1><strong>Robots.txt</strong></h1> <br />
<p>Before scraping a website, it's important to check its robots.txt file, which provides guidelines on whether web crawlers are allowed and which parts of the website they can access. Always respect the rules outlined in the robots.txt file to avoid legal issues and maintain good web scraping practices.</p> <br />
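<p>Python's standard library can check robots.txt rules for you. Here's a small sketch (example.com and the URL are placeholders):</p> <br />
<div className="snippet">
<pre>
<code>
{`from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

# Check whether a generic crawler may fetch a given URL
url = "https://example.com/search?page=1"
print(rp.can_fetch("*", url))  # True if allowed, False if disallowed
`}
</code>
</pre>
</div> <br />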


<h1><strong>Error Handling and Retry Strategies</strong></h1> <br />
<p>When scraping large amounts of data or dealing with network requests, errors can occur. Implement robust error handling and retry strategies to handle timeouts, network issues, and other unexpected problems gracefully. This may include logging errors, delaying retries, or changing IP addresses.</p> <br />
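<p>Here's a minimal retry sketch with exponential backoff; the retry count and delays are arbitrary placeholders:</p> <br />
<div className="snippet">
<pre>
<code>
{`import time

import requests

def fetch_with_retries(url, max_retries=3):
    """Fetch a URL, retrying with exponential backoff on failure."""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as exc:
            wait = 2 ** attempt  # 1s, 2s, 4s, ...
            print(f"Attempt {attempt + 1} failed ({exc}); retrying in {wait}s")
            time.sleep(wait)
    raise RuntimeError(f"Giving up on {url} after {max_retries} attempts")
`}
</code>
</pre>
</div> <br />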


<h1><strong>Legal and Ethical Considerations</strong></h1> <br />
<p>Always ensure that your web scraping activities comply with legal and ethical guidelines. Respect website terms of service, privacy policies, and copyright laws. Scraping should be for legitimate purposes, and you should avoid scraping sensitive or personal information.</p> <br />


<h1><strong>Rate Limiting and Throttling</strong></h1> <br />
<p>To avoid overloading a website's server with requests, implement rate limiting and throttling mechanisms in your scraping script. This can help you stay within acceptable usage limits and maintain a good relationship with the sites you scrape.</p> <br />
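<p>One simple approach is to enforce a minimum interval between consecutive requests; the one-second interval below is an arbitrary placeholder:</p> <br />
<div className="snippet">
<pre>
<code>
{`import time

import requests

MIN_INTERVAL = 1.0  # Minimum seconds between consecutive requests
_last_request = 0.0

def throttled_get(session, url):
    """Sleep as needed so requests are at least MIN_INTERVAL apart."""
    global _last_request
    elapsed = time.monotonic() - _last_request
    if elapsed < MIN_INTERVAL:
        time.sleep(MIN_INTERVAL - elapsed)
    _last_request = time.monotonic()
    return session.get(url, timeout=10)

session = requests.Session()
response = throttled_get(session, "https://example.com/page/1")
`}
</code>
</pre>
</div> <br />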


</div>



</div>
);
};

export default AdvanceWebScrapping;