|
df1 = ''  # placeholder; replaced by a pandas DataFrame once scraping starts
sheet_out = "Tabelle1"  # Excel sheet name used for the final output

# Control variables: per-category-level cap on webscraping iterations.
max_web_i = [
    float('Inf'),  # cat0 -> not used
    float('Inf'),  # cat1
    2,             # cat2
]
# max_web_i = [float('Inf')] * 3  # uncomment to scrape everything
62 | 65 |
|
63 | 66 | # URLs |
|
89 | 92 | logging.info('TEST')  # TEST mode: no live HTTP request is made here
90 | 93 | else:
91 | 94 | r = requests.get(url);cr = cr+1  # fetch the page; cr is the global request counter
92 | | - r
93 | 95 | logging.debug('REQUEST No ' + str(cr) + ': ' + url )
94 | 96 | c = r.content
95 | 97 | soup = BeautifulSoup(c,"html.parser")  # parse with the stdlib html.parser backend
|
124 | 126 |
|
logging.info('\n' + '-'*30 + '\n1.2) Cat1\n' + '-'*30 + '\n')

# ------------------------------------------------------------------
# 1.2) Cat1: for every level-0 category row in df1, collect its
# level-1 sub-category links into df2 — either loaded from a cached
# pickle or freshly scraped.
# ------------------------------------------------------------------
cat_label = 'Kategorie_1'
file = r'out' + '\\' + cat_label  # output base path (Windows-style separator)

if load_df[1]:
    # Reuse the cached result of a previous run.
    df2 = pd.read_pickle(file + '.pkl')
else:
    I = 0   # running index over all collected sub-category links
    l = []  # row dicts, converted to a DataFrame at the end
    for index, row in df1.iterrows():
        url = row["url"]
        r = requests.get(url)
        cr = cr + 1  # global request counter
        logging.debug('REQUEST No ' + str(cr) + ': ' + url)
        c = r.content
        soup = BeautifulSoup(c, "html.parser")
        # The side menu <aside> holds this page's sub-category list.
        all_cat1 = soup.find("aside", {"class": "sidemenu categories syncheight"})
        sublinks = all_cat1.find_all('li', class_="")
        for link in sublinks:
            link_a = link.find('a')
            if link_a.has_attr('href'):
                I += 1
                logging.debug('I=' + str(I) + ' - ' + link_a.text + ' - ' + link_a['href'])
                d = {}
                d["index"] = I
                if TEST:
                    d["url"] = link_a['href']  # offline test files already hold absolute URLs
                else:
                    d["url"] = url_root + link_a['href']
                d["Kategorie - Level0"] = row["Kategorie - Level0"]
                d["Kategorie - Level1"] = link_a.text
                d["Kategorie - Level2"] = ""
                l.append(d)
        if index >= max_web_i[1] - 1:
            # Log the cat1 limit actually applied (was the whole list).
            logging.warning('Maximum number of webscapping : {}'.format(max_web_i[1]))
            break
    df2 = DataFrame(l)
    logging.info(df2.head())  # BUG FIX: previously logged df1 instead of the new df2
    df2.to_csv(file + '.csv')
    df2.to_pickle(file + '.pkl')
| 169 | + |
# Report total runtime as HH:MM:SS.
# time.clock() was deprecated in 3.3 and removed in Python 3.8;
# perf_counter() is the documented replacement.
# NOTE(review): t0 is assigned outside this view — confirm it uses the
# same clock so the difference is meaningful.
T = time.perf_counter()
RUNTIME = time.strftime("%H:%M:%S", time.gmtime(T - t0))
logging.info('END \n\n RUNTIME :' + RUNTIME)
|
0 commit comments