Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 12 additions & 20 deletions GFG_article_extractor.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,36 @@
#--------------------------------------------------
#--------------------------------------------------
# Name: GeeksForGeeks Article Extractor
# Purpose: To download and save articles filed under each and every tag mentioned in www.geeksforgeeks.org
#
#--------------------------------------------------
#--------------------------------------------------


#!/usr/bin/python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import urllib2
import os


# Every geeksforgeeks.org tag this script knows how to crawl, kept for
# reference so the crawl set below can be widened again easily.
ALL_KNOWN_TAGS = ['interview-experience', 'advance-data-structures',
                  'dynamic-programming', 'Greedy-Algorithm', 'backtracking',
                  'pattern-searching', 'divide-and-conquer', 'graph',
                  'MathematicalAlgo', 'recursion', 'Java']

# Tags actually crawled on this run; restrict to a subset to limit downloads.
AllTags = ['Greedy-Algorithm']

# Base directory under which one folder per tag is created. '' means the
# current working directory; on Windows use e.g. "E:\\GeeksForGeeks\\".
path = ''

def ExtractMainLinks(AllTags, path):
    """Collect article links for each tag and hand them off for download.

    AllTags: list of geeksforgeeks.org tag slugs (e.g. 'Greedy-Algorithm').
    path:    base directory under which one folder per tag is created.

    For each tag, fetches http://www.geeksforgeeks.org/tag/<tag>/, scrapes
    the article links out of the <h2 class="entry-title"> headings, and
    passes a list of [url, slug] pairs to Extract_And_Save_Page_Data.
    """
    for tag in AllTags:
        newpath = path + tag
        # Guard the mkdir so a re-run over an existing folder doesn't crash.
        if not os.path.isdir(newpath):
            os.mkdir(newpath)
        # NOTE(review): newpath is created but the download step writes to
        # the current working directory — confirm intended output location.
        url = "http://www.geeksforgeeks.org/tag/" + tag + "/"
        data = urllib2.urlopen(url).read()
        soup = BeautifulSoup(data, "html.parser")
        allLinks = soup.findAll("h2", class_="entry-title")
        listofLinks = []
        for heading in allLinks:
            # Read the href directly from the anchor instead of string-
            # splitting the tag's repr, which crashes on malformed entries.
            anchor = heading.find("a")
            if anchor is None or not anchor.get("href"):
                continue  # skip headings without a usable link
            mainLink = anchor["href"]
            # [absolute URL, URL slug] — the slug becomes the output filename.
            listofLinks.append([mainLink, mainLink.split('/')[3]])
        Extract_And_Save_Page_Data(listofLinks)



def Extract_And_Save_Page_Data(listofLinks):
    """Download each article and save it as <slug>.html.

    listofLinks: list of [url, slug] pairs as produced by ExtractMainLinks.

    Files are written to the current working directory. Each page body is
    fetched with urllib2 and written verbatim in binary mode.
    """
    for item in listofLinks:
        pageData = urllib2.urlopen(item[0]).read()
        filePath = item[1] + ".html"
        # Write the raw payload: the str() wrapper was redundant on the
        # Python 2 str payload and would corrupt bytes under Python 3.
        with open(filePath, "wb") as f:
            f.write(pageData)
Expand Down