-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
78 lines (68 loc) · 3.04 KB
/
app.py
File metadata and controls
78 lines (68 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import re
import httpcore
from bs4 import BeautifulSoup
import httpx
import asyncio
import json
def extract_int(string):
    """Return the integer embedded in *string*, ignoring surrounding text.

    Every character other than ASCII digits, ``.`` and ``-`` is removed
    and the remainder is converted to ``int``. If the remainder contains a
    decimal point (for example a trailing period picked up from the text),
    it is parsed as a float and truncated toward zero.

    Args:
        string: Text containing a number, e.g. ``"1,234"``.

    Returns:
        The parsed integer.

    Raises:
        ValueError: If no number can be recovered from *string*.
    """
    # Raw string: "\-" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    cleaned = re.sub(r"[^0-9.\-]", "", string)
    try:
        return int(cleaned)
    except ValueError:
        pass
    # The pattern keeps ".", which int() cannot parse; go through float so
    # inputs such as "5." or "3.0" still yield an integer.
    try:
        return int(float(cleaned))
    except (ValueError, OverflowError):
        raise ValueError(f"no integer found in {string!r}") from None
async def fetch_company(pathname):
    """Download one detail page from work.mma.go.kr and scrape its table cells.

    The request is retried for as long as it keeps timing out.

    Args:
        pathname: URL path appended to ``https://work.mma.go.kr``
            (presumably starts with ``/`` -- it comes from an ``href``).

    Returns:
        A dict with the text of the cells stored under ``name``,
        ``address``, ``tel``, ``fax``, ``kind`` and ``scale``, plus
        ``active_service`` and ``recruit_service``: lists of three ints
        read from rows 3-5 (columns 2 and 4 respectively) of the second
        table and parsed with ``extract_int``.

    Raises:
        AttributeError: If an expected cell is not present in the page
            (``select_one`` returns ``None``).
        ValueError: If ``extract_int`` cannot parse one of the count cells.
    """
    url = 'https://work.mma.go.kr' + pathname

    # The context manager closes the client (and its connection pool) once
    # the page has been downloaded; the original never closed it.
    async with httpx.AsyncClient() as client:
        while True:
            try:
                response = await client.get(url)
                break
            except httpx.TimeoutException:
                # httpx raises its own public timeout exception; the private
                # httpcore._exceptions class is not what callers receive.
                continue

    soup = BeautifulSoup(response.content, 'html.parser')

    def cell_text(div, row, col=None):
        """Return the text of one <td>, addressed by nth-child positions."""
        selector = (
            f'#content > div:nth-child({div}) > table > tbody > '
            f'tr:nth-child({row}) > td'
        )
        if col is not None:
            selector += f':nth-child({col})'
        return soup.select_one(selector).text

    name = cell_text(1, 1)
    print(name)

    count_rows = (3, 4, 5)
    return {
        'name': name,
        'address': cell_text(1, 2),
        'tel': cell_text(1, 3, 2),
        'fax': cell_text(1, 3, 4),
        'kind': cell_text(2, 1, 2),
        'scale': cell_text(2, 2, 2),
        'active_service': [
            extract_int(cell_text(2, row, 2)) for row in count_rows
        ],
        'recruit_service': [
            extract_int(cell_text(2, row, 4)) for row in count_rows
        ],
    }
async def main():
    """Run the search, scrape every linked detail page, and save the results.

    Posts the search form to work.mma.go.kr, collects the ``href`` of each
    result link, fetches all detail pages concurrently with
    ``fetch_company`` and writes the resulting list to ``agent.json``.
    The number of links found and the scraped list are printed to stdout.
    """
    form = {
        'al_eopjong_gbcd': '11111,11112',
        'eopjong_gbcd_list': '11111,11112',
        'eopjong_gbcd': '1',
        'pageUnit': '1000',
        'pageIndex': '1',
    }
    # Use the async client so the request does not block the event loop the
    # way the synchronous httpx.post() call does inside a coroutine.
    async with httpx.AsyncClient() as client:
        response = await client.post(
            'https://work.mma.go.kr/caisBYIS/search/byjjecgeomsaek.do',
            data=form,
        )

    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.select('#content > table > tbody > tr > th > a')
    print(len(links))

    # gather() schedules every coroutine concurrently and returns the
    # results in the same order as the links.
    res = await asyncio.gather(
        *(fetch_company(link['href']) for link in links)
    )
    print(res)

    with open('agent.json', 'w', encoding='utf-8') as fp:
        json.dump(res, fp)
# Script entry point: asyncio.run() creates a fresh event loop, runs main()
# to completion and closes the loop. It replaces the manual
# get_event_loop() / run_until_complete() / close() sequence, which is
# deprecated when no event loop is running.
asyncio.run(main())