import requests
import io
from bs4 import BeautifulSoup
import pandas as pd
from tabulate import tabulate
from typing import Tuple, List
import re
from datetime import datetime
def get_soup(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the scrape indefinitely.

    Returns:
        A BeautifulSoup tree built with the ``html.parser`` backend.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
def get_csv_from_url(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a CSV file from *url* and load it into a DataFrame.

    Args:
        url: Address of the CSV resource.
        timeout: Seconds to wait for the server before giving up, so a
            stalled download cannot hang the script forever.

    Returns:
        The DataFrame produced by ``pd.read_csv`` from the response body,
        decoded as UTF-8.
    """
    raw_bytes = requests.get(url, timeout=timeout).content
    return pd.read_csv(io.StringIO(raw_bytes.decode('utf-8')))
def print_tabulate(df: pd.DataFrame) -> None:
    """Print *df* to stdout as an org-mode style table, using its column names as headers."""
    print(tabulate(df, headers=df.columns, tablefmt='orgtbl'))
Código para extraer la información de transparencia de la UANL
<<includes-base-fn>>
def limpiar_nombre_dependencia(nombre_sucio: str) -> str:
    """Return *nombre_sucio* without its first two space-separated tokens.

    The text is split on single spaces, the first two pieces are discarded
    and the rest is re-joined with single spaces. Inputs with fewer than
    three pieces yield an empty string.
    """
    nombre_en_partes = nombre_sucio.split(' ')
    return ' '.join(nombre_en_partes[2:])
def obtener_cantidad_de_filas(df: pd.DataFrame) -> int:
    """Return the number of rows in *df* (the length of its index)."""
    return len(df.index)
def limpiar_dato_sueldo(sueldo_txt: str) -> float:
    """Convert a salary string to a float.

    The first two characters are dropped (presumably a currency prefix such
    as "$ " -- confirm against the scraped page) and thousands-separator
    commas are removed before the conversion.

    Raises:
        ValueError: If the remaining text is not a valid float literal.
    """
    sin_prefijo = sueldo_txt[2:]
    return float(sin_prefijo.replace(",", ""))
def get_dependencias_uanl() -> Tuple[List[Tuple[str, str]], List[str], List[str]]:
    """Read the filter options offered by the UANL remuneration search form.

    Returns:
        A 3-tuple ``(dependencias, meses, anios)`` where:

        * ``dependencias`` is a list of ``(option value, cleaned option text)``
          pairs taken from the second row of the first table on the page;
        * ``meses`` holds the option values found in the first cell of the
          third row;
        * ``anios`` holds the option values found in the second cell of the
          third row.
    """
    soup = get_soup("http://transparencia.uanl.mx/remuneraciones_mensuales/bxd.php")
    filas = soup.find_all("table")[0].find_all('tr')
    listado_dependencias = [
        (option['value'], limpiar_nombre_dependencia(option.text))
        for option in filas[1].find_all("option")
    ]
    # Both period selectors live in the same row; look the cells up once.
    celdas_periodo = filas[2].find_all('td')
    listado_meses = [option['value'] for option in celdas_periodo[0].find_all("option")]
    listado_anios = [option['value'] for option in celdas_periodo[1].find_all("option")]
    return (listado_dependencias, listado_meses, listado_anios)
def get_pages(periodo: str, area: str) -> List[str]:
    """List the result-page labels available for one area and period.

    Args:
        periodo: Value sent as the ``mya_det`` query parameter.
        area: Value sent as the ``id_area_form`` query parameter.

    Returns:
        ``'1'`` followed by the text of every link found in the second table
        of the first results page, or an empty list when the page has no
        second table.
    """
    soup = get_soup(f"http://transparencia.uanl.mx/remuneraciones_mensuales/bxd.php?pag_act=1&id_area_form={area}&mya_det={periodo}")
    try:
        links = soup.find_all("table")[1].find_all('a')
    except IndexError as e:
        # Fewer than two tables on the page: report it and treat the
        # combination as having no pages.
        print(e)
        return []
    return ['1'] + [link.text for link in links]
def get_info_transparencia_uanl(periodo: str, area: str, page:int = 1) -> pd.DataFrame:
    """Scrape one results page of the UANL remuneration listing.

    Args:
        periodo: Value sent as the ``mya_det`` query parameter.
        area: Value sent as the ``id_area_form`` query parameter.
        page: Value sent as the ``pag_act`` query parameter.

    Returns:
        A DataFrame built from the third table of the page: the first row
        supplies the column names, the "Sueldo Neto" column is converted with
        ``limpiar_dato_sueldo`` and the "Detalle" column is dropped. If any
        step of that extraction fails, a message is printed and an empty
        DataFrame is returned instead.
    """
    soup = get_soup(f"http://transparencia.uanl.mx/remuneraciones_mensuales/bxd.php?pag_act={page}&id_area_form={area}&mya_det={periodo}")
    tablas = soup.find_all("table")
    try:
        filas = [
            [celda.text.strip() for celda in fila.find_all('td')]
            for fila in tablas[2].find_all('tr')
        ]
        df = pd.DataFrame(filas[1:], columns=filas[0])
        df["Sueldo Neto"] = df["Sueldo Neto"].transform(limpiar_dato_sueldo)
        df = df.drop(['Detalle'], axis=1)
    except Exception as e:
        # Best effort: a page without the expected table or columns must not
        # abort the whole scrape, so report it and fall back to an empty frame.
        print(f"pagina sin informacion a: {area}, per: {periodo}, page:{page}")
        print(e)
        df = pd.DataFrame()
    return df
def unir_datos(ldf: List[pd.DataFrame], dependencia:Tuple[str,str], mes: str, anio:str) -> pd.DataFrame:
    """Concatenate page DataFrames and tag every row with its origin.

    Args:
        ldf: DataFrames to stack, one per scraped page.
        dependencia: Pair whose second element is stored in the
            ``dependencia`` column.
        mes: Value stored in the ``mes`` column.
        anio: Value stored in the ``anio`` column.

    Returns:
        The concatenation of *ldf* with ``dependencia``, ``mes`` and ``anio``
        columns added, or an empty DataFrame when *ldf* is empty. The input
        frames are not modified.
    """
    if not ldf:
        return pd.DataFrame()
    df = pd.concat(ldf)
    # A scalar assignment is broadcast to every row by pandas, so there is no
    # need to build a per-row list.
    df["dependencia"] = dependencia[1]
    df["mes"] = mes
    df["anio"] = anio
    return df
# Walk every (year, month, dependency) combination offered by the search
# form, scrape all of its result pages and collect one DataFrame per
# combination.
listado_dependencias, listado_meses, listado_anios = get_dependencias_uanl()
ldfs = []
for anio in listado_anios:
    for mes in listado_meses:
        # The period string is the same for every dependency of this month.
        periodo = f"{mes}{anio}"
        for dependencia in listado_dependencias:
            pages = get_pages(periodo, dependencia[0])
            print(f"m: {mes} a: {anio} d: {dependencia} p: {pages}")
            ldf = [get_info_transparencia_uanl(periodo, dependencia[0], page) for page in pages]
            udf = unir_datos(ldf, dependencia, mes, anio)
            ldfs.append(udf)
df = pd.concat(ldfs)
df.to_csv("csv/uanl2024.csv", index=False)
Código para extraer la información de los estados de México de la página de Wikipedia
<<includes-base-fn>>
def wiki() -> pd.DataFrame:
    """Scrape the first table of Wikipedia's "List of states of Mexico" page.

    Returns:
        A DataFrame with one row per table row after the header. Column
        names come from the ``th`` cells of the first row; values come from
        the stripped text of each row's ``td`` cells.
    """
    soup = get_soup("https://en.wikipedia.org/wiki/List_of_states_of_Mexico")
    rows = soup.find_all("table")[0].find_all('tr')
    encabezados = [header.text.strip() for header in rows[0].find_all('th')]
    list_of_lists = [
        [column.text.strip() for column in row.find_all('td')]
        for row in rows[1:]
    ]
    return pd.DataFrame(list_of_lists, columns=encabezados)
# Build the states DataFrame with wiki() and print it as a table.
df = wiki()
print_tabulate(df)
df.to_csv("csv/estados.csv", index=False)


def remove_repeated_number(str_repeated_value: str) -> float:
    """Parse a number whose digits appear twice in a row.

    Leading zeros and commas are removed first; if what remains has an even
    length, its first half is converted to a float.

    Args:
        str_repeated_value: Text such as ``"1,2341,234"``. Non-string values
            are converted with ``str`` before processing.

    Returns:
        The number encoded by the first half of the cleaned text, or ``0.0``
        when the cleaned text is empty or has an odd length.
    """
    if not isinstance(str_repeated_value, str):
        str_repeated_value = str(str_repeated_value)
    str_sin_0 = re.sub("^0+", '', str_repeated_value)
    str_sin_comma = str_sin_0.replace(',', '')
    # An empty remainder (e.g. the input was "" or only zeros) has nothing to
    # halve; treat it like the odd-length case instead of crashing in float('').
    if not str_sin_comma or len(str_sin_comma) % 2 != 0:
        return 0.0
    mitad = len(str_sin_comma) // 2
    return float(str_sin_comma[:mitad])
def extract_int_number(str_value: str) -> float:
    """Parse the number at the very start of *str_value*.

    The leading run of digits, commas and dots is taken, commas are removed
    and the result is converted with ``float`` (the function has always
    returned a float despite its name).

    Args:
        str_value: Text beginning with a number, e.g. ``"570[b]"``.
            Non-string values are converted with ``str`` first, matching
            ``remove_repeated_number``.

    Returns:
        The parsed value as a float.

    Raises:
        ValueError: If the text does not start with a parseable number.
    """
    if not isinstance(str_value, str):
        str_value = str(str_value)
    # The pattern can match the empty string, so match() never returns None.
    str_value_clean = re.match(r'[\d,\.]*', str_value).group(0)
    # float() already accepts leading zeros; stripping them by hand made a
    # plain "0" fail with ValueError.
    return float(str_value_clean.replace(',', ''))
def remove_repeated_date(str_date_repeated: str) -> datetime:
    """Parse the first eight characters of *str_date_repeated* as ``YYYYMMDD``.

    Anything after the eighth character is ignored.

    Raises:
        ValueError: If the first eight characters are not a valid date.
    """
    fecha_compacta = str_date_repeated[0:8]
    return datetime.strptime(fecha_compacta, '%Y%m%d')
def limpiar_area(area: str) -> Tuple[float, float]:
    """Extract the two area figures from a raw area cell.

    Every run of digits, commas and dots in *area* is collected. The first
    token that is exactly ``'2'`` is discarded (presumably the exponent of
    "km2" -- confirm against the scraped text). The first remaining token is
    parsed with ``remove_repeated_number`` and the second with ``float``
    after removing commas.

    Returns:
        ``(km_float, mi_float)`` built from the first and second numeric
        tokens respectively.

    Raises:
        ValueError: If fewer than two numeric tokens remain, or the second
            one is not a valid float.
    """
    # The pattern also matches empty strings between tokens; drop them in a
    # single pass.
    str_en_partes = [parte for parte in re.findall(r'[\d,\.]*', area) if parte]
    # Only discard the '2' token when it is actually present, so text without
    # it does not fail before any number is read.
    if '2' in str_en_partes:
        str_en_partes.remove('2')
    if len(str_en_partes) < 2:
        raise ValueError(f"could not find two numeric values in area text: {area!r}")
    km_str, mi_str = str_en_partes[0], str_en_partes[1]
    km_float = remove_repeated_number(km_str)
    mi_float = float(mi_str.replace(',', ''))
    return (km_float, mi_float)
# Load the scraped states table and drop the 'Coat of arms' column.
df = pd.read_csv("csv/estados.csv")
df = df.drop(columns=['Coat of arms'])
# Give the remaining columns short Spanish identifiers.
df.columns = [
    'estado',
    'nombre_oficial',
    'capital',
    'ciudad_mas_grande',
    'area',
    'poblacion_2020',
    'num_de_municipios',
    'lugar',
    'fecha_de_admision',
]
# Convert the raw text of each column with its dedicated cleaning helper.
df['lugar'] = df['lugar'].transform(remove_repeated_number)
df['poblacion_2020'] = df['poblacion_2020'].transform(remove_repeated_number)
df['fecha_de_admision'] = df['fecha_de_admision'].transform(remove_repeated_date)
df['num_de_municipios'] = df['num_de_municipios'].transform(extract_int_number)
# Split the combined area cell into one numeric column per unit, then
# discard the original text column.
areas = df['area'].transform(limpiar_area).to_list()
df['area_km2'] = [km for km, _ in areas]
df['area_mi'] = [mi for _, mi in areas]
df = df.drop(columns=['area'])
print_tabulate(df)
df.to_csv("csv/estados_limpio.csv", index=False)
Crear un data frame desde un archivo CSV.
# Load a DataFrame from a local CSV file and print it.
df = pd.read_csv("/home/jhernandez/Sync/FCFMClases/21-1FJ/DataMining/dm_lmv_6.csv")
print_tabulate(df)
# Load a DataFrame from a CSV published at a URL and print it.
df = get_csv_from_url("https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv")
print_tabulate(df)
df.to_csv("csv/paises.csv", index=False)
| | Country | Region |
|---|---|---|
| 0 | Algeria | AFRICA |
| 1 | Angola | AFRICA |
| 2 | Benin | AFRICA |
| 3 | Botswana | AFRICA |
| 4 | Burkina | AFRICA |
| 5 | Burundi | AFRICA |
| 6 | Cameroon | AFRICA |
| 7 | Cape Verde | AFRICA |
| 8 | Central African Republic | AFRICA |
| 9 | Chad | AFRICA |
| 10 | Comoros | AFRICA |
| 11 | Congo | AFRICA |
| 12 | Congo, Democratic Republic of | AFRICA |
| 13 | Djibouti | AFRICA |
| 14 | Egypt | AFRICA |
| 15 | Equatorial Guinea | AFRICA |
| 16 | Eritrea | AFRICA |
| 17 | Ethiopia | AFRICA |
| 18 | Gabon | AFRICA |
| 19 | Gambia | AFRICA |
| 20 | Ghana | AFRICA |
| 21 | Guinea | AFRICA |
| 22 | Guinea-Bissau | AFRICA |
| 23 | Ivory Coast | AFRICA |
| 24 | Kenya | AFRICA |
| 25 | Lesotho | AFRICA |
| 26 | Liberia | AFRICA |
| 27 | Libya | AFRICA |
| 28 | Madagascar | AFRICA |
| 29 | Malawi | AFRICA |
| 30 | Mali | AFRICA |
| 31 | Mauritania | AFRICA |
| 32 | Mauritius | AFRICA |
| 33 | Morocco | AFRICA |
| 34 | Mozambique | AFRICA |
| 35 | Namibia | AFRICA |
| 36 | Niger | AFRICA |
| 37 | Nigeria | AFRICA |
| 38 | Rwanda | AFRICA |
| 39 | Sao Tome and Principe | AFRICA |
| 40 | Senegal | AFRICA |
| 41 | Seychelles | AFRICA |
| 42 | Sierra Leone | AFRICA |
| 43 | Somalia | AFRICA |
| 44 | South Africa | AFRICA |
| 45 | South Sudan | AFRICA |
| 46 | Sudan | AFRICA |
| 47 | Swaziland | AFRICA |
| 48 | Tanzania | AFRICA |
| 49 | Togo | AFRICA |
| 50 | Tunisia | AFRICA |
| 51 | Uganda | AFRICA |
| 52 | Zambia | AFRICA |
| 53 | Zimbabwe | AFRICA |
| 54 | Afghanistan | ASIA |
| 55 | Bahrain | ASIA |
| 56 | Bangladesh | ASIA |
| 57 | Bhutan | ASIA |
| 58 | Brunei | ASIA |
| 59 | Burma | ASIA |
| 60 | Cambodia | ASIA |
| 61 | China | ASIA |
| 62 | East Timor | ASIA |
| 63 | India | ASIA |
| 64 | Indonesia | ASIA |
| 65 | Iran | ASIA |
| 66 | Iraq | ASIA |
| 67 | Israel | ASIA |
| 68 | Japan | ASIA |
| 69 | Jordan | ASIA |
| 70 | Kazakhstan | ASIA |
| 71 | Korea, North | ASIA |
| 72 | Korea, South | ASIA |
| 73 | Kuwait | ASIA |
| 74 | Kyrgyzstan | ASIA |
| 75 | Laos | ASIA |
| 76 | Lebanon | ASIA |
| 77 | Malaysia | ASIA |
| 78 | Maldives | ASIA |
| 79 | Mongolia | ASIA |
| 80 | Nepal | ASIA |
| 81 | Oman | ASIA |
| 82 | Pakistan | ASIA |
| 83 | Philippines | ASIA |
| 84 | Qatar | ASIA |
| 85 | Russian Federation | ASIA |
| 86 | Saudi Arabia | ASIA |
| 87 | Singapore | ASIA |
| 88 | Sri Lanka | ASIA |
| 89 | Syria | ASIA |
| 90 | Tajikistan | ASIA |
| 91 | Thailand | ASIA |
| 92 | Turkey | ASIA |
| 93 | Turkmenistan | ASIA |
| 94 | United Arab Emirates | ASIA |
| 95 | Uzbekistan | ASIA |
| 96 | Vietnam | ASIA |
| 97 | Yemen | ASIA |
| 98 | Albania | EUROPE |
| 99 | Andorra | EUROPE |
| 100 | Armenia | EUROPE |
| 101 | Austria | EUROPE |
| 102 | Azerbaijan | EUROPE |
| 103 | Belarus | EUROPE |
| 104 | Belgium | EUROPE |
| 105 | Bosnia and Herzegovina | EUROPE |
| 106 | Bulgaria | EUROPE |
| 107 | Croatia | EUROPE |
| 108 | Cyprus | EUROPE |
| 109 | Czech Republic | EUROPE |
| 110 | Denmark | EUROPE |
| 111 | Estonia | EUROPE |
| 112 | Finland | EUROPE |
| 113 | France | EUROPE |
| 114 | Georgia | EUROPE |
| 115 | Germany | EUROPE |
| 116 | Greece | EUROPE |
| 117 | Hungary | EUROPE |
| 118 | Iceland | EUROPE |
| 119 | Ireland | EUROPE |
| 120 | Italy | EUROPE |
| 121 | Latvia | EUROPE |
| 122 | Liechtenstein | EUROPE |
| 123 | Lithuania | EUROPE |
| 124 | Luxembourg | EUROPE |
| 125 | Macedonia | EUROPE |
| 126 | Malta | EUROPE |
| 127 | Moldova | EUROPE |
| 128 | Monaco | EUROPE |
| 129 | Montenegro | EUROPE |
| 130 | Netherlands | EUROPE |
| 131 | Norway | EUROPE |
| 132 | Poland | EUROPE |
| 133 | Portugal | EUROPE |
| 134 | Romania | EUROPE |
| 135 | San Marino | EUROPE |
| 136 | Serbia | EUROPE |
| 137 | Slovakia | EUROPE |
| 138 | Slovenia | EUROPE |
| 139 | Spain | EUROPE |
| 140 | Sweden | EUROPE |
| 141 | Switzerland | EUROPE |
| 142 | Ukraine | EUROPE |
| 143 | United Kingdom | EUROPE |
| 144 | Vatican City | EUROPE |
| 145 | Antigua and Barbuda | NORTH AMERICA |
| 146 | Bahamas | NORTH AMERICA |
| 147 | Barbados | NORTH AMERICA |
| 148 | Belize | NORTH AMERICA |
| 149 | Canada | NORTH AMERICA |
| 150 | Costa Rica | NORTH AMERICA |
| 151 | Cuba | NORTH AMERICA |
| 152 | Dominica | NORTH AMERICA |
| 153 | Dominican Republic | NORTH AMERICA |
| 154 | El Salvador | NORTH AMERICA |
| 155 | Grenada | NORTH AMERICA |
| 156 | Guatemala | NORTH AMERICA |
| 157 | Haiti | NORTH AMERICA |
| 158 | Honduras | NORTH AMERICA |
| 159 | Jamaica | NORTH AMERICA |
| 160 | Mexico | NORTH AMERICA |
| 161 | Nicaragua | NORTH AMERICA |
| 162 | Panama | NORTH AMERICA |
| 163 | Saint Kitts and Nevis | NORTH AMERICA |
| 164 | Saint Lucia | NORTH AMERICA |
| 165 | Saint Vincent and the Grenadines | NORTH AMERICA |
| 166 | Trinidad and Tobago | NORTH AMERICA |
| 167 | United States | NORTH AMERICA |
| 168 | Australia | OCEANIA |
| 169 | Fiji | OCEANIA |
| 170 | Kiribati | OCEANIA |
| 171 | Marshall Islands | OCEANIA |
| 172 | Micronesia | OCEANIA |
| 173 | Nauru | OCEANIA |
| 174 | New Zealand | OCEANIA |
| 175 | Palau | OCEANIA |
| 176 | Papua New Guinea | OCEANIA |
| 177 | Samoa | OCEANIA |
| 178 | Solomon Islands | OCEANIA |
| 179 | Tonga | OCEANIA |
| 180 | Tuvalu | OCEANIA |
| 181 | Vanuatu | OCEANIA |
| 182 | Argentina | SOUTH AMERICA |
| 183 | Bolivia | SOUTH AMERICA |
| 184 | Brazil | SOUTH AMERICA |
| 185 | Chile | SOUTH AMERICA |
| 186 | Colombia | SOUTH AMERICA |
| 187 | Ecuador | SOUTH AMERICA |
| 188 | Guyana | SOUTH AMERICA |
| 189 | Paraguay | SOUTH AMERICA |
| 190 | Peru | SOUTH AMERICA |
| 191 | Suriname | SOUTH AMERICA |
| 192 | Uruguay | SOUTH AMERICA |
| 193 | Venezuela | SOUTH AMERICA |