# md2docx/md2docx.py (Rasek91/md2docx, master branch)

from argparse import ArgumentParser
from docx import Document, opc, oxml
from docx.enum.dml import MSO_THEME_COLOR_INDEX
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Inches, RGBColor
from glob import glob
from html.parser import HTMLParser
from io import BytesIO
from mistune import markdown
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import get_lexer_by_name
from requests import get
from os import path
from yaml import safe_load

class MDConverter(HTMLParser):
    """
    Create docx file from a list of MD files.

    Every MD file is rendered to HTML with mistune and the HTML is fed to this
    parser, which rebuilds the content in a python-docx document created from
    the configured template.

    Attributes:
      - config: the name of the YAML config file

    Keys read from the config file: "Template", "Folder", "Filename",
    "Table Style", "Inline Code Font", "Inline Code Color",
    "Syntax Highlight Style", "Image Size" and "Unit of Image Size".
    """

    #tags which create a heading paragraph (the digit is the heading level)
    HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")
    #prefix of the paragraph style of each list tag, the list level is appended
    LIST_STYLES = {"ul": "unordered list", "ol": "ordered list"}
    #tags of the table cells
    CELL_TAGS = ("th", "td")
    #width of the new table columns in cm
    COLUMN_WIDTH = 2.0
    #seconds to wait for the server when an image is downloaded
    IMAGE_TIMEOUT = 30

    #init function
    def __init__(self, config):
        """Load the config file and the template and reset the parser state."""
        #load the config file
        self.parse_config(config)
        #variables to save the tags during handle_data
        self.highlighting = False
        self.paragraph = None
        self.style = None
        self.last = None
        self.last_list_style = None
        self.link = None
        self.table = None
        self.table_row = 0
        self.table_column = 0
        self.list_level = 0
        #tags of the currently open lists, the innermost list is the last item
        self.list_stack = []
        #list of tags to handle
        self.type_tags = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "pre", "table"]
        self.list_tags = ["ol", "ul"]
        self.style_tags = ["code", "del", "em", "strong"]
        self.table_tags = ["th", "td", "tr"]
        self.link_tags = ["a"]
        #load the template document
        self.document = Document(self.config["Template"])
        #create a codehighlighter instance
        self.codehighlighter = CodeHighlighter(self.document)
        #call the init function of the super
        super().__init__()

    #function to get the value of an HTML attribute
    @staticmethod
    def _get_attribute(attrs, name):
        """Return the value of the first attribute called name or None."""
        return next((value for key, value in attrs if key == name), None)

    #function to parse config file
    def parse_config(self, config):
        """Store the content of the YAML file config in self.config."""
        with open(config, "r") as stream:
            #safe load the YAML config file content
            self.config = safe_load(stream)

    #function to do the parsing
    def do(self):
        """Parse all the MD files then save the docx file."""
        self.parse_mds()
        self.generate()

    #function to parse MD files
    def parse_mds(self):
        """Parse the *.md files of the configured folder in sorted order."""
        #create a sorted list of MD files
        mds = sorted(glob(path.join(self.config["Folder"], "*.md")))

        #parse all the files
        for filename in mds:
            self.parse_md(filename)

    #function to parse one MD file
    def parse_md(self, filename):
        """Render the MD file filename to HTML and feed it to the parser."""
        #read the file with a context manager to not leak the file handle
        with open(filename) as stream:
            #create HTML from MD file
            parsed = markdown(stream.read())

        #feed the HTML version to the HTMLParser
        self.feed(parsed)

    #function to add paragraph
    def create_paragraph(self, tag):
        """
        Create the document element of the tag which has just been opened.

        The element is chosen by the block which is currently open
        (self.paragraph), tag only matters inside of a list: a list tag opens
        a new list level and 'li' creates the paragraph of a list item.
        """
        context = self.paragraph

        #create heading paragraph
        if context in self.HEADING_TAGS:
            self.last = self.document.add_heading("", level = int(context[1]))
        #create a new paragraph
        elif context == 'p':
            self.last = self.document.add_paragraph("")
        #create a code paragraph
        elif context == 'pre':
            self.last = self.document.add_paragraph("", style = "code")
        #create list paragraph
        elif context in self.list_tags:
            #every list tag opens a new level, even if its type is not the
            #same as the type of the outer list
            if tag in self.list_tags:
                self.list_stack.append(tag)
                self.list_level = len(self.list_stack)
            #the style of the item comes from the innermost open list
            elif tag == 'li' and self.list_stack:
                style = "{}{}".format(self.LIST_STYLES[self.list_stack[-1]], self.list_level)
                self.last = self.document.add_paragraph("", style = style)
        #create a new table
        elif context == 'table':
            self.table = self.document.add_table(rows = 1, cols = 1, style = self.config["Table Style"])

            for cell in self.table.columns[0].cells:
                cell.width = Cm(self.COLUMN_WIDTH)

            self.table.autofit = True

    #function to add hyperlink
    #https://stackoverflow.com/a/47666747
    def add_hyperlink(self, text, url):
        """Append text to the last paragraph as an external link to url."""
        # This gets access to the document.xml.rels file and gets a new relation id value
        part = self.last.part
        r_id = part.relate_to(url, opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external = True)
        # Create the w:hyperlink tag and add needed values
        hyperlink = oxml.shared.OxmlElement("w:hyperlink")
        hyperlink.set(oxml.shared.qn("r:id"), r_id)
        # Create a w:r element and a new w:rPr element
        new_run = oxml.shared.OxmlElement("w:r")
        rPr = oxml.shared.OxmlElement("w:rPr")
        # Join all the xml elements together and add the required text to the w:r element
        new_run.append(rPr)
        new_run.text = text
        hyperlink.append(new_run)
        # Create a new Run object and add the hyperlink into it
        r = self.last.add_run()
        r._r.append(hyperlink)
        # A workaround for the lack of a hyperlink style (doesn't go purple after using the link)
        # Delete this if using a template that has the hyperlink style in it
        r.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
        r.font.underline = True

    #function to add horizontal ruler
    #https://github.com/python-openxml/python-docx/issues/105#issuecomment-442786431
    def inserthr(self):
        """Draw a bottom border under the last paragraph of the document."""
        p = self.document.paragraphs[-1]._p  # p is the <w:p> XML element
        pPr = p.get_or_add_pPr()
        pBdr = OxmlElement("w:pBdr")
        #the listed tags are the elements which have to follow w:pBdr
        pPr.insert_element_before(pBdr,
            "w:shd", "w:tabs", "w:suppressAutoHyphens", "w:kinsoku", "w:wordWrap",
            "w:overflowPunct", "w:topLinePunct", "w:autoSpaceDE", "w:autoSpaceDN",
            "w:bidi", "w:adjustRightInd", "w:snapToGrid", "w:spacing", "w:ind",
            "w:contextualSpacing", "w:mirrorIndents", "w:suppressOverlap", "w:jc",
            "w:textDirection", "w:textAlignment", "w:textboxTightWrap",
            "w:outlineLvl", "w:divId", "w:cnfStyle", "w:rPr", "w:sectPr",
            "w:pPrChange"
        )
        bottom = OxmlElement("w:bottom")
        bottom.set(qn("w:val"), "single")
        bottom.set(qn("w:sz"), "6")
        bottom.set(qn("w:space"), "1")
        bottom.set(qn("w:color"), "auto")
        pBdr.append(bottom)

    #function to add style to the last run
    def add_style(self):
        """Apply the saved style tag to the last run of the last paragraph."""
        #nothing to format if there is no paragraph or it has no run yet
        if self.last is None or not self.last.runs:
            return

        #get the font of the last run
        font = self.last.runs[-1].font

        #bold
        if self.style == 'strong':
            font.bold = True
        #italic
        elif self.style == 'em':
            font.italic = True
        #strike
        elif self.style == 'del':
            font.strike = True
        #inline code
        elif self.paragraph != 'pre' and self.style == 'code':
            color = self.config["Inline Code Color"]
            font.name = self.config["Inline Code Font"]
            #the color is a hexadecimal RRGGBB string
            font.color.rgb = RGBColor(int(color[0:2], 16), int(color[2:4], 16), int(color[4:], 16))

    #function to add image to the document
    def add_image(self, image):
        """
        Add the picture image (a path or an http(s) link) to the document.

        Raises ValueError if the configured unit is not cm or inches. A failed
        download raises the exception of requests.
        """
        #check the unit first to not download the image for nothing
        unit = self.config["Unit of Image Size"].lower()

        if unit == 'cm':
            width = Cm(self.config["Image Size"])
        elif unit == 'inches':
            width = Inches(self.config["Image Size"])
        else:
            raise ValueError("The unit of the image size should be cm or inches.")

        #download the image if it is a link
        if image.startswith(("http://", "https://")):
            response = get(image, timeout = self.IMAGE_TIMEOUT)
            #fail with the HTTP error instead of adding an error page as picture
            response.raise_for_status()
            image = BytesIO(response.content)

        self.document.add_picture(image, width = width)

    #function to handle start of an HTML tag
    def handle_starttag(self, tag, attrs):
        """Create the document element of tag or save tag for handle_data."""
        #the class of a code tag holds the language of the code block
        code_class = self._get_attribute(attrs, "class") if tag == 'code' else None

        #add a new paragraph
        if (tag in self.type_tags or tag in self.list_tags) and not self.highlighting:
            if self.paragraph is None:
                self.paragraph = tag

            self.create_paragraph(tag)
        #add code syntax highlight
        elif tag == 'code' and self.paragraph == 'pre' and code_class:
            #the language is the part after the last dash of the class
            self.highlighting = [code_class.split("-")[-1], self.config["Syntax Highlight Style"]]
        #add new line to a list
        elif tag == 'li':
            self.create_paragraph(tag)
        #save style tag
        elif tag in self.style_tags:
            self.style = tag
        #save link tag
        elif tag in self.link_tags:
            href = self._get_attribute(attrs, "href")

            #the text of a link without target is added as simple text
            if href is not None:
                self.link = [tag, href]
        #add horizontal ruler
        elif tag == 'hr':
            self.inserthr()
        #add image
        elif tag == 'img':
            source = self._get_attribute(attrs, "src")

            #an image without source can not be added
            if source is not None:
                self.add_image(source)
        #save table tag
        elif tag in self.table_tags:
            self.paragraph = tag

            if self.table is not None:
                #every row after the first one needs a new row in the table
                if tag == 'tr' and self.table_row != 0:
                    self.table.add_row()
                #every header cell after the first one needs a new column
                elif tag == 'th' and self.table_column != 0:
                    self.table.add_column(width = Cm(self.COLUMN_WIDTH))

    #function to handle end of an HTML tag
    def handle_endtag(self, tag):
        """Reset the state which was saved when tag was opened."""
        #reset list tags
        if tag in self.list_tags:
            #ignore the end of a list which was never opened
            if not self.list_stack:
                return

            self.list_stack.pop()
            self.list_level = len(self.list_stack)

            if tag == 'ol':
                self.last_list_style = None

            if self.list_level == 0:
                self.paragraph = None
                self.last = None
        #reset code syntax highlighter
        elif tag == 'code' and self.paragraph == 'pre' and self.highlighting:
            self.highlighting = False
        #reset table
        elif tag == 'table':
            if self.table is not None:
                #set table length to not be full page length
                #https://github.com/python-openxml/python-docx/issues/315#issuecomment-239259678
                for r in self.table.rows:
                    for c in r._tr.tc_lst:
                        tcW = c.tcPr.tcW
                        tcW.type = 'auto'
                        tcW.w = 0

            self.table = None
            self.table_row = 0
            self.table_column = 0
            self.paragraph = None
        #reset paragraph tags
        elif tag in self.type_tags and tag == self.paragraph:
            self.paragraph = None
            self.last = None
        #reset style tags
        elif tag in self.style_tags:
            self.style = None
        #reset link tags
        elif tag in self.link_tags:
            self.link = None
        #reset table tags
        elif tag in self.table_tags:
            self.paragraph = None

            if tag == 'tr':
                self.table_column = 0
                self.table_row += 1
            #step to the next cell at the end of the cell, so empty cells and
            #cells with more than one piece of text take exactly one column
            else:
                self.table_column += 1

    #function to do the syntax highlighting of a code block
    def _highlight(self, data):
        """Add data to the code paragraph with the saved language and style."""
        language, style = self.highlighting

        try:
            lexer = get_lexer_by_name(language)
        #pygments raises ClassNotFound, a ValueError, for an unknown language
        except ValueError:
            #add the code without highlighting
            if self.last is not None:
                self.last.add_run(data)
        else:
            self.codehighlighter.feed(highlight(data, lexer, HtmlFormatter(style = style, noclasses = True)))

    #function to get the current cell of the table
    def _current_cell(self):
        """Return the cell at the saved row and column of the table."""
        return self.table.rows[self.table_row].cells[self.table_column]

    #function to handle data inside an HTML tag
    def handle_data(self, data):
        """Add the text data to the current paragraph or table cell."""
        #add text to not a table
        if self.table is None:
            #add text if it not a new line only
            if data != '\n' and data:
                #do code syntax highlighting
                if self.highlighting:
                    self._highlight(data)
                #add text to the last paragraph
                elif self.last is not None:
                    #add simple text
                    if self.link is None:
                        self.last.add_run(data)
                    #add link
                    elif self.link[0] == 'a':
                        self.add_hyperlink(data, self.link[1])

                #add style to the last text
                if self.style is not None:
                    self.add_style()
        #add text to a table
        else:
            #merge cell to the upper one
            if data == '^^':
                self._current_cell().merge(self.table.rows[self.table_row - 1].cells[self.table_column])
            #merge cell to the left one
            elif data == '<<':
                self._current_cell().merge(self.table.rows[self.table_row].cells[self.table_column - 1])
            #add text to the header or to the table body
            elif self.paragraph in self.CELL_TAGS:
                cell = self._current_cell()
                #append the text, a cell gets more than one piece of text if
                #there is a tag inside of it
                cell.text = cell.text + data

    #function to generate the docx file from the template
    def generate(self):
        """Save the document to the configured filename."""
        #call the function of Document to save the new file
        self.document.save(self.config["Filename"])

    #delete function
    def __del__(self):
        pass

class CodeHighlighter(HTMLParser):
    """
    Add code syntax highlighting to a code paragraph.

    The parser is fed with HTML where the formatting is in the inline style
    of span tags. Every span becomes a run in the last paragraph of the
    document, with the color and the bold setting of the span.

    Attributes:
      - document: document where the highlighting needed
    """

    #init function
    def __init__(self, document):
        """Save the document where the highlighted code is added."""
        self.document = document
        #True if the text after a closed span needs a new run without format
        self.plain_run_pending = False
        #call the init function of the super
        super().__init__()

    #function to parse the inline style of a tag
    @staticmethod
    def _parse_declarations(styles):
        """Return the "name: value; name: value" style string as a dict."""
        declarations = {}

        for declaration in styles.split(";"):
            name, separator, value = declaration.partition(":")

            #strip the spaces, there is one after every semicolon and colon
            if separator:
                declarations[name.strip().lower()] = value.strip()

        return declarations

    #function to parse a color of the inline style
    @staticmethod
    def _hex_to_rgb(value):
        """Return the (red, green, blue) tuple of a #RRGGBB color or None."""
        digits = value[1:] if value.startswith("#") else ""

        if len(digits) != 6 or not set(digits) <= set("0123456789abcdefABCDEF"):
            return None

        return tuple(int(digits[index:index + 2], 16) for index in (0, 2, 4))

    #function to handle start of an HTML tag
    def handle_starttag(self, tag, attrs):
        """Start a new run for a span and format it from the inline style."""
        #only span tags carry formatting, every other tag is ignored
        if tag != 'span':
            return

        #every span gets its own run
        font = self.document.paragraphs[-1].add_run().font
        self.plain_run_pending = False
        styles = next((value for name, value in attrs if name == "style"), None)

        #if there is no highlighting the font of the run is unchanged
        if not styles:
            return

        declarations = self._parse_declarations(styles)
        rgb = self._hex_to_rgb(declarations.get("color", ""))

        if rgb is not None:
            font.color.rgb = RGBColor(*rgb)

        if declarations.get("font-weight") == 'bold':
            font.bold = True

    #function to handle end of an HTML tag
    def handle_endtag(self, tag):
        """Mark that the text after a closed span is not part of its run."""
        if tag == 'span':
            self.plain_run_pending = True

    #function to handle data inside an HTML tag
    def handle_data(self, data):
        """Add data to the current run, a new line becomes a line break."""
        if not data:
            return

        paragraph = self.document.paragraphs[-1]
        runs = paragraph.runs

        #text outside of a span must not get the format of the previous span,
        #and there is no run at all if the text comes before the first span
        if self.plain_run_pending or not runs:
            run = paragraph.add_run()
            self.plain_run_pending = False
        else:
            run = runs[-1]

        for index, line in enumerate(data.split('\n')):
            #add break between the lines
            if index:
                run.add_break()

            #add the text of the line to the run
            if line:
                run.add_text(line)

    #delete function
    def __del__(self):
        pass

#function to parse the arguments
def parse_arguments():
    """
    Read the command line and return the name of the config file.

    The only argument is the required -c/--config option.
    """
    cli = ArgumentParser(description = "MD 2 DOCX Converter")
    #remove the last of the default groups of the parser, so the group added
    #below takes its place in the help
    cli._action_groups.pop()
    #the config option lives in its own "Required Arguments" group
    cli.add_argument_group("Required Arguments").add_argument(
        "-c", "--config",
        type = str,
        required = True,
        help = "The configuration file of MD 2 DOCX Converter"
    )

    #parse the command line and give back the config file name only
    return cli.parse_args().config

#main function
def main():
    """Convert the MD files described by the config file of the command line."""
    #the config file name comes from the command line, the converter loads
    #it during its creation and does the converting right away
    MDConverter(parse_arguments()).do()

#run main only if the script is called directly, not when it is imported
if __name__ == '__main__':
    main()