-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgetDpc.py
More file actions
executable file
·94 lines (80 loc) · 2.42 KB
/
getDpc.py
File metadata and controls
executable file
·94 lines (80 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Download DPC data files linked from an MHLW web page and load them into MySQL.

Usage: getDpc year path
  year : data year as a western year (e.g. 2013)
  path : URL of the MHLW page listing the DPC files
"""
import dbconfig
import sys
import requests
import re
import html
from bs4 import BeautifulSoup
import dpcExtractor
import mysql.connector

# Require exactly two CLI arguments: the data year and the page URL.
if len(sys.argv) != 3:
    print()
    print("Usage: getDpc year path")
    print()
    print("year : data year in western year (like 2013)")
    print("path : the webpage URL")
    print()
    sys.exit(-1)

year = int(sys.argv[1])
fullurl = sys.argv[2]

# Split the page URL into the site root (everything up to and including
# ".go.jp/") and the directory the page lives in.  download_file() later
# resolves relative links against the site root.
# Raw string: the original non-raw "\." escape triggers a SyntaxWarning
# (invalid escape sequence) on Python 3.12+.
sitematcher = re.compile(r"(.*\.go\.jp/)(.*)")
surl = sitematcher.match(fullurl)
if surl:
    siteurl = surl.group(1)
    part = fullurl.rpartition('/')
    baseurl = part[0] + "/"
    print(siteurl)
    print(baseurl)
else:
    # Not an MHLW (*.go.jp) URL — nothing to scrape.
    sys.exit(-1)
def download_file(url):
    """Download *url* (relative to the global ``siteurl``) into downloads/.

    Streams the response in 1 KiB chunks so large files are never held
    fully in memory.  Returns the local file path.

    Raises requests.HTTPError if the server answers with an error status
    (without this check an HTML error page would be saved as data).
    """
    import os
    url = siteurl + url
    local_filename = 'downloads/' + url.split('/')[-1]
    print(url)
    # Make sure the target directory exists before opening the file.
    os.makedirs('downloads', exist_ok=True)
    # stream=True defers the body download until iter_content() reads it;
    # the with-block guarantees the connection is released afterwards.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    return local_filename
# Patterns for classifying the links found on the page:
#  - matcher: DPC data-file links labelled "MDC..." (e.g. MDC01-16)
#  - shochi:  distinguishes procedure ("処置") file variants 1 and 2
#  - gaiyou:  the facility-overview sheet ("施設概要表")
# matcher is a raw string: the original non-raw "\-" escape triggers a
# SyntaxWarning (invalid escape sequence) on Python 3.12+.
matcher = re.compile(r'(MDC[0-9\-]{2,4})')
shochi = re.compile(u'処置([12])')
gaiyou = re.compile(u'施設概要表')
print("Connecting to MHLW")
r2 = requests.get(fullurl)
soup = BeautifulSoup(r2.text, "html.parser")
mydb = dbconfig.mydb
con = mysql.connector.connect(host=mydb['host'], user=mydb['user'],
                              db=mydb['database'], charset='utf8')
print("Starting Analysis")
links = soup.find_all("a")
filetype = 0
for link in links:
    # Anchors without an href (named anchors, javascript targets) would
    # raise KeyError with link.attrs['href']; skip them instead.
    href = link.get('href')
    if href is None:
        continue
    text = link.text
    # Shisetsu Gaiyou Hyou (facility overview) -> hospital name list
    if gaiyou.search(text):
        print("Hospital Name List")
        fn = download_file(href)
        dpcExtractor.getHospitals(fn, year, con)
    else:
        match = matcher.search(text)
        if match:
            # filetype 0 = plain sheet, 1/2 = procedure ("処置") variants
            s = shochi.search(text)
            filetype = int(s.group(1)) if s else 0
            print("Filetype ", filetype)
            print(match.group(1))
            print(href)
            fn = download_file(href)
            if filetype == 0:
                dpcExtractor.getOneSheetS(fn, year, con)
            elif filetype == 1:
                dpcExtractor.getOneSheetT(fn, year, con, 1)
            elif filetype == 2:
                dpcExtractor.getOneSheetT(fn, year, con, 2)
            print("---------------")
# Release the MySQL connection once every file has been processed.
con.close()