# wiki2html.py -- from the DriedFishMatters/pywikibot-scripts repository (master branch)
"""wiki2html (pywikibot script)

Script for exporting wiki pages in a given category to static html pages
for public distribution

USAGE:

    python pwb.py wiki2html [options]

OPTIONS:

  -category:CATEGORY (required)
    The name of the category to list pages from
  -out:PATH (required)
    The path on disk to the directory for output
  -base:BASENAME (required)
    The base URL of the wiki. Relative image URLs found in the rendered
    pages are resolved against it when the images are downloaded
  -sitename:SITENAME (required)
    The name of the site, to be included in the html header
  -template:TEMPLATE (required)
    Path to an HTML template file containing Python str.format fields, used
    to generate the output. See wiki2html_sample-web-template.txt. The
    required fields are sitename, title, and content (i.e., page body text);
    literal braces in the template must be doubled ({{ and }}).

"""

import os
import urllib
import urllib.parse

import pywikibot
import requests
from bs4 import BeautifulSoup
from slugify import slugify


# HTML fragment for a single entry (<dt>/<dd> pair) of the index page.
# Filled in with str.format using the fields url, title and extract.
INDEX_TEMPLATE = """<dt><a href="{url}">{title}</a></dt>
<dd class="extract">{extract}</dd>
"""

# Request headers sent when downloading images. Images are fetched directly
# over HTTP instead of via the API, and a browser-like User-Agent is used to
# avoid being blocked by mod_security.
HTTP_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}

def index(site, titles):
    """Build the HTML entries for the index page.

    Requests a short plain-text intro extract for every title that does not
    start with ``File:`` and renders one ``INDEX_TEMPLATE`` entry per page
    returned by the API.

    Args:
        site: The pywikibot site to query.
        titles: Iterable of page titles to list.

    Returns:
        The rendered entries joined by single spaces; an empty string when
        there are no titles to list.
    """
    # Local import keeps this function self-contained.
    from html import escape

    titles = [title for title in titles if not title.startswith('File:')]
    if not titles:
        # Nothing to look up; skip the API round trip entirely.
        return ''

    extracts = pywikibot.data.api.PropertyGenerator(
        site=site,
        prop='extracts|info',
        titles=titles,
        exsentences=5,
        exintro=1,
        exsectionformat='plain',
        explaintext=1)

    out = []
    for e in extracts:
        title = e['title']
        out.append(INDEX_TEMPLATE.format(
            url=slugify(title) + '.html',
            # Title and extract are plain text (explaintext=1), so they must
            # be escaped before being embedded in HTML.
            title=escape(title),
            # Some pages come back without an extract; render them with an
            # empty description instead of failing with KeyError.
            extract=escape(e.get('extract', ''))))

    return ' '.join(out)

def postprocess(html, titles, out, basename):
    """Prepare a rendered wiki page for static publication.

    The following clean-up steps are applied to the markup:

    * ``div`` elements with class ``magnify`` and ``span`` elements with
      class ``mw-editsection`` are removed.
    * Links whose ``title`` is one of ``titles`` are pointed at the matching
      exported ``<slug>.html`` file. Fragment links (``#...``), anchors
      without ``href`` and links with class ``external`` or ``homepage`` are
      left alone. Every other link is unwrapped, keeping its content.
    * Images are downloaded into ``out`` (unless a file of that name already
      exists there) and their ``src`` is rewritten to the local file name.

    Args:
        html: The complete HTML document to process.
        titles: Titles of the pages being exported.
        out: Directory that receives downloaded images.
        basename: Base URL against which image ``src`` values are resolved.

    Returns:
        The processed document as a string.
    """
    # Seconds to wait for the image server before giving up on one image.
    download_timeout = 30
    # Membership is tested once per link, so use a set.
    exported = set(titles)

    soup = BeautifulSoup(html, 'html.parser')
    for elem, attr in [('div', 'magnify'), ('span', 'mw-editsection')]:
        for e in soup.find_all(elem, attr):
            e.decompose()

    # remove internal links if not in list
    for a in soup.find_all('a'):
        href = a.get('href')
        # Anchors without href are not links; indexing a['href'] would raise
        # KeyError for them.
        if href is None or href.startswith('#'):
            continue
        classes = a.get('class', '')
        if 'external' in classes or 'homepage' in classes:
            continue
        title = a.get('title', None)
        if title and title in exported:
            a['href'] = slugify(title) + '.html'
        else:
            a.unwrap()

    for img in soup.find_all('img'):
        src = img.get('src', None)
        # Inline data: URIs carry their own content; there is nothing to fetch.
        if not src or src.startswith('data:'):
            continue
        # Use only the path component so that a query string or fragment does
        # not become part of the local file name.
        path = urllib.parse.urlsplit(src).path
        filename = os.path.basename(urllib.parse.unquote(path))
        if not filename:
            continue
        local_filename = os.path.join(out, filename)
        if not os.path.exists(local_filename):
            remote_url = urllib.parse.urljoin(basename, src)
            print("Retrieving {}...".format(remote_url))
            try:
                data = requests.get(remote_url, headers=HTTP_HEADERS,
                                    timeout=download_timeout)
                # Do not store an HTTP error page as if it were the image; it
                # would also never be re-fetched because the file would exist.
                data.raise_for_status()
            except requests.RequestException as err:
                print("Could not retrieve {}: {}".format(remote_url, err))
                # The image keeps its original src when the download fails.
                continue
            with open(local_filename, 'wb') as img_file:
                img_file.write(data.content)
        img['src'] = urllib.parse.quote(filename)
    return str(soup)


def run(*args):
    """Export every article of a wiki category to static HTML files.

    Options are read from ``args`` (``-name:value`` form, after pywikibot has
    consumed its own global arguments); any required option that is missing
    or empty is requested interactively. An ``index.html`` plus one
    ``<slug>.html`` file per article is written to the output directory.

    Args:
        *args: Command-line style arguments handed to
            ``pywikibot.handle_args``.
    """
    required = ['category', 'out', 'base', 'sitename', 'template']
    options = {}

    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        option, _, value = arg.partition(':')
        options[option.strip('-')] = value
    for option in required:
        if not options.get(option):
            options[option] = pywikibot.input(
                'Please enter a value for ' + option)

    # Read the template as UTF-8, matching the encoding of the files written.
    with open(options['template'], 'r', encoding='utf-8') as template_file:
        tpl = template_file.read()

    # Make sure the output directory exists before writing into it.
    os.makedirs(options['out'], exist_ok=True)

    site = pywikibot.Site()
    cat = pywikibot.page.Category(site, options['category'])
    # Materialise the generator once so the category is fetched a single time
    # and the title list and the exported pages cannot diverge.
    pages = list(cat.articles())
    titles = [page.title() for page in pages]

    # Merging into one dict lets title/content take precedence over any option
    # of the same name instead of raising a duplicate-keyword TypeError.
    index_fields = dict(options, title=options['sitename'],
                        content=index(site, titles))
    with open(os.path.join(options['out'], 'index.html'), 'w',
              encoding='utf-8') as h:
        h.write(tpl.format(**index_fields))

    for page in pages:
        print('Processing {}...'.format(page.title()))
        page_fields = dict(options, title=page.title(),
                           content=page.get_parsed_page())
        html = tpl.format(**page_fields)
        html = postprocess(html, titles, options['out'], options['base'])
        pagename = slugify(page.title()) + '.html'
        with open(os.path.join(options['out'], pagename), 'w',
                  encoding='utf-8') as h:
            h.write(html)


# Run the export when this file is executed as a script.
if __name__ == '__main__':
    run()