-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathUsaplData.py
More file actions
80 lines (68 loc) · 3.1 KB
/
UsaplData.py
File metadata and controls
80 lines (68 loc) · 3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 14 11:47:41 2017
@author: yzamriy
Goal: Create UsaplData object containing a nested dictionary with
data scraped from target pages
This is a parent object for CompetitionList and RankingList objects
Input: reference, string. Part of the url address for target page
Main methods: 1. get_col_names: returns a list of key names in the dictionary
2. return_dict: returns nested dictionary with data
Requirements: Soup.py to get Beautiful Soup objects
"""
from Soup import *
import random
class UsaplData(object):
    """Scrape a USAPL target page into a nested dictionary.

    Parent object for CompetitionList and RankingList objects.

    Attributes:
        reference: str. Part of the url address for the target page.
        data_dict: nested dict with data scraped from the target page.
            Level 1 keys -> text within a['href'] tag
            Level 2 keys -> text within 'th' tags
            Values -> text within 'td' tags
        col_names: list of key names from the dictionary (with 'Link'
            prepended for the top-level href keys).
    """

    def __init__(self, reference):
        """Initialize the object and scrape the target page immediately.

        Inputs: reference, string. Part of the url address for target page.
        """
        self.reference = reference
        self.data_dict = self.build_dict()
        self.col_names = self.get_col_names()

    def build_dict(self):
        """Build a nested dictionary with data scraped from the target page.

        The page is fetched and parsed via getSoup (Soup.py). The data
        lives in the table with class "tabledata": 'th' cells in 'thead'
        supply the column names, 'tr'/'td' cells in 'tbody' supply the
        values, and each row's first anchor href is the top-level key.

        Inputs: none
        Returns: nested dict {href: {column_name: cell_text}}
        """
        data_dict = {}
        soup = getSoup(self.reference)
        # Locate the table once instead of re-parsing the tree for both
        # the body rows and the header cells.
        table = soup.find("table", class_="tabledata")
        rows = table.find('tbody').find_all('tr')
        headers = table.find('thead').find_all('th')
        for row in rows:
            link = row.find('a')
            if link is None or not link.has_attr('href'):
                # Skip malformed rows; the original row.find('a')['href']
                # would raise TypeError on a row with no anchor.
                continue
            row_name = link['href']
            data_dict[row_name] = {}
            for header, cell in zip(headers, row.find_all('td')):
                data_dict[row_name][header.get_text()] = cell.get_text().strip()
        return data_dict

    def get_col_names(self):
        """Return column names corresponding to keys in the dictionary.

        Every row shares the same header-derived keys, so the first row
        (deterministic, unlike the previous random.choice) is inspected.
        An empty scrape yields ['Link'] instead of raising IndexError.

        Inputs: none
        Returns: list of key names, with 'Link' first.
        """
        if not self.data_dict:
            return ['Link']
        first_row = next(iter(self.data_dict))
        col_names = list(self.data_dict[first_row].keys())
        # 'Link' names the top-level keys, which hold the url reference.
        col_names.insert(0, 'Link')
        return col_names

    def return_dict(self):
        """Return the nested dictionary with the scraped data."""
        return self.data_dict
#test = UsaplData('competitions')