Scrapper.py
import os
import requests
from bs4 import BeautifulSoup


def download_file(url, path):
    response = requests.get(url)
    if response.status_code == 200:
        with open(path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {path}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")


def is_github_folder_with_files(url):
    # Send an HTTP request to the GitHub repository page
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check for the presence of an element or class associated with folders
        folder_indicator = soup.find('span', class_='css-truncate-target')
        # Check for the presence of files within the folder
        file_indicator = soup.find('a', class_='js-navigation-open Link--primary')
        return folder_indicator is not None or file_indicator is not None
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return False


# Example usage:
url = 'https://github.com/CommanderChrisOrg/CommanderChris/tree/main/src'
if is_github_folder_with_files(url):
    print(f"The URL '{url}' represents a GitHub folder with files.")
else:
    print(f"The URL '{url}' does not represent a GitHub folder with files.")


def scrape_github_repository(url):
    # Send an HTTP request to the GitHub repository page
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')
        # Create a directory with the repository name
        repo_dir = 'CommanderChrisOrg_CommanderChris'
        base_path = "/Users/diliarakaniazova/Downloads"
        os.makedirs(os.path.join(base_path, repo_dir), exist_ok=True)
        # Find and download all links (files) on the page
        links = soup.find_all('a', {'class': 'js-navigation-open'})
        for link in links:
            if is_github_folder_with_files(url):
                file_url = 'https://github.com' + link.get('href') + '?raw=true'
                file_name = os.path.join(base_path, repo_dir, link.get('title'))
                download_file(file_url, file_name)
    else:
        print(f"Failed to retrieve the repository page. Status code: {response.status_code}")

# Example usage
github_url = 'https://github.com/CommanderChrisOrg/CommanderChris'
scrape_github_repository(github_url)
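

# Minimal usage sketch (an assumption, not part of the original script): the 'docs'
# tree URL below is hypothetical and only illustrates how the two functions compose —
# verify that a tree URL looks like a folder with files before handing it to the scraper.
docs_url = 'https://github.com/CommanderChrisOrg/CommanderChris/tree/main/docs'
if is_github_folder_with_files(docs_url):
    scrape_github_repository(docs_url)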