Skip to content

Commit 750b0af

Browse files
EEDEED
authored and committed
webscrapping Level 1
1 parent 7e09002 commit 750b0af

1 file changed

Lines changed: 45 additions & 3 deletions

File tree

web.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
# --- Configuration -------------------------------------------------
df1 = ''  # Pandas DataFrame placeholder; filled during scraping

sheet_out = "Tabelle1"  # name of the output spreadsheet sheet

# Steuervariablen (control variables)
# Per-category cap on web-scraping loop iterations;
# float('Inf') means "no limit" for that category.
max_web_i = [
    float('Inf'),  # cat0 -> is not used
    float('Inf'),  # cat1
    2,             # cat2
]  # maximum loop for webscrapping
# max_web_i = float('Inf') # scrap all

# URLs
@@ -89,7 +92,6 @@
8992
logging.info('TEST')
9093
else:
9194
r = requests.get(url);cr = cr+1
92-
r
9395
logging.debug('REQUEST No ' + str(cr) + ': ' + url )
9496
c = r.content
9597
soup = BeautifulSoup(c,"html.parser")
# ------------------------------------------------------------------
# 1.2) Cat1: collect level-1 category links from each level-0 page.
# Reads df1 (one row per level-0 category page), writes df2 and the
# corresponding .csv/.pkl cache files under out\Kategorie_1.*
# ------------------------------------------------------------------
logging.info('\n' + '-'*30 + '\n1.2) Cat1\n' + '-'*30 + '\n')

cat_label = 'Kategorie_1'
out_file = r'out' + '\\' + cat_label  # Windows-style path, no extension

if load_df[1]:
    # Reuse the cached result instead of re-scraping the web.
    df2 = pd.read_pickle(out_file + '.pkl')
else:
    I = 0      # running index over all collected category links
    rows = []  # list of dicts; becomes df2 below
    for index, row in df1.iterrows():
        url = row["url"]
        r = requests.get(url)
        cr = cr + 1  # global request counter (shared with other passes)
        logging.debug('REQUEST No ' + str(cr) + ': ' + url)
        c = r.content
        soup = BeautifulSoup(c, "html.parser")
        # Sidebar menu holding the category list on each page.
        # NOTE(review): assumes every page has this aside; a page
        # without it would raise AttributeError on find_all below.
        all_cat1 = soup.find("aside", {"class": "sidemenu categories syncheight"})
        sublinks = all_cat1.find_all('li', class_="")
        for link in sublinks:
            link_a = link.find('a')
            has_link = link_a.has_attr('href')
            if has_link:
                d = {}
                I += 1
                logging.debug('I=' + str(I) + ' - ' + link_a.text + ' - ' + link_a['href'])
                d["index"] = I
                if TEST:
                    d["url"] = link_a['href']  # put absolute URL in offline file
                else:
                    d["url"] = url_root + link_a['href']
                d["Kategorie - Level0"] = row["Kategorie - Level0"]
                d["Kategorie - Level1"] = link_a.text
                d["Kategorie - Level2"] = ""
                rows.append(d)
        # Stop after max_web_i[1] level-0 rows (index is the df1 index).
        if index >= max_web_i[1] - 1:
            # was "webscapping" -- typo in the log message
            logging.warning('Maximum number of webscraping : {}'.format(max_web_i))
            break
    df2 = DataFrame(rows)
    logging.info(df2.head())  # was df1.head(): log the frame just built
    df2.to_csv(out_file + '.csv')
    df2.to_pickle(out_file + '.pkl')
# End-of-run bookkeeping: log total runtime formatted as HH:MM:SS.
# NOTE(review): time.clock() was removed in Python 3.8; t0 is captured
# elsewhere with the same clock, so both call sites must migrate
# together (e.g. to time.perf_counter()) -- not changed here alone.
T = time.clock()
elapsed_seconds = T - t0
RUNTIME = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))
logging.info('END \n\n RUNTIME :' + RUNTIME)

0 commit comments

Comments
 (0)