ljdump/ljmkstatic at master · vbwagner/ljdump · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/python
# -*- encoding: utf-8 -*-
"""
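Formats the post (L-nnnn) and comment (C-nnnn) files downloaded by
ljdump.py into static HTML pages using templates.

Syntax: ljmkstatic

The script takes no command-line arguments: templates, URLs and
directories are read from ljmkstatic.conf in the current directory, and
every L-nnnn file found in the dump directory is converted to nnnn.html.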

"""
from ConfigParser import ConfigParser
import xml.dom.minidom, xml.dom
import re
import sys,os,glob
import codecs
# Configuration parameters:
#   - directory for symlinks
#   - URL of the picture with the "little person" icon
#   - directory with the dump results
#   - directory for pictures
#   - template for the post page
#   - template for the comment block

# Template texts, keyed by template name.
template = {}
# URL settings, keyed by option name.
urls = {}
# Directory settings, keyed by option name.
dirs = {}
def read_templates(config):
    """
    Load the 'post' and 'comment' templates into the global template dict.

    The path of each template file is taken from the option of the same
    name in the 'templates' section of config; files are read as UTF-8.
    """
    global template
    for kind in ('post', 'comment'):
        path = config.get('templates', kind)
        with codecs.open(path, 'r', 'utf-8') as source:
            template[kind] = source.read()


def set_parameters(config):
    """
    Copy the 'urls' and 'directories' sections of config into the global
    urls and dirs dicts, one entry per option.
    """
    global urls, dirs
    for section, target in (('urls', urls), ('directories', dirs)):
        for option in config.options(section):
            target[option] = config.get(section, option)

def process_ljtag(m):
    """
    Convert one LiveJournal-specific tag into plain HTML.

    Meant to be passed to re.sub as the replacement callback: receives
    the match object of an lj tag and returns the HTML text that should
    replace it.

    - lj-cut tags are dropped (replaced with an empty string).
    - a tag with a user= attribute becomes a link to that user's profile
      page, preceded by the userinfo icon from urls['icons'] and labelled
      with the title= attribute if present, otherwise with the user name.
    - any other tag (including a user= attribute without a usable name)
      is reported on stdout and dropped.
    """
    tag = m.group(0)
    if "lj-cut" in tag:
        return ""
    user = re.search(r'''user=['"]?(\w+)['"]?''', tag)
    if user:
        name = user.group(1)
        title = re.search(r'''title=["']?([^"'>]+)['"]?''', tag)
        label = title.group(1) if title else name
        return ('<a style="color: blue; font-weight: bold;" '
                'href="http://www.livejournal.com/users/%s/profile">'
                '<img src="%s">%s</a>'
                % (name, urls['icons'] + "/userinfo.gif", label))
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("unknown lj tag:  %s" % (tag,))
    # Return an explicit empty string so the tag is removed from the text.
    return ""

def process_text(text):
    """
    Convert raw entry or comment text into HTML.

    lj-specific tags are rewritten by process_ljtag and every line break
    (LF or CRLF) is replaced with <br>.  Returns the converted text.

    If rewriting the tags raises, the offending text is printed and the
    exception is re-raised unchanged.
    """
    # Take the text and treat it as HTML, replacing the lj tags.
    try:
        text = re.sub(r"</?lj[^>]+>", process_ljtag, text)
    except Exception:
        print("bad text : %s" % (text,))
        # A bare raise keeps the original traceback.
        raise
    text = re.sub(r"\r?\n", "<br>", text)
    # TODO: replace <img> tags with local copies when they exist, and
    # record the missing ones in the picture cache otherwise.
    return text
def format_comments(cmt_list):
    """
    Render a list of comment dicts (each with a 'children' list) through
    the 'comment' template and return the concatenated HTML.

    Each dict is updated in place before rendering: 'comments' receives
    the rendered replies ('' when there are none) and, when the comment
    has a 'user' key, 'userlink' receives the HTML link for that user.
    """
    def render(entry):
        replies = entry['children']
        if len(replies) == 0:
            entry['comments'] = ''
        else:
            entry['comments'] = format_comments(replies)
        if 'user' in entry:
            entry['userlink'] = process_text('<lj user="%s">' % entry['user'])
        return template['comment'] % entry

    return ''.join([render(entry) for entry in cmt_list])

def _node_text(node):
    """Return the value of node's first child, or '' for an empty element."""
    child = node.firstChild
    if child is None:
        return ''
    return child.nodeValue


def _read_post(postfile):
    """Parse an L-nnnn dump file into the dict used to fill the post template."""
    post_props = {'subject': '', 'taglist': ''}
    post_xml = xml.dom.minidom.parse(postfile)
    for n in post_xml.documentElement.childNodes:
        if n.nodeType != xml.dom.Node.ELEMENT_NODE:
            continue
        if n.nodeName == u'event':
            post_props['text'] = process_text(_node_text(n))
        elif n.nodeName == u'props':
            # Pull the metadata entries up into the top-level dict.
            for n2 in n.childNodes:
                if n2.nodeType == xml.dom.Node.ELEMENT_NODE:
                    post_props[str(n2.nodeName)] = _node_text(n2)
        else:
            post_props[str(n.nodeName)] = _node_text(n)
    if 'text' not in post_props:
        raise ValueError("No event node in this post")
    return post_props


def _parse_comment(node):
    """Build the template dict for one <comment> element."""
    comment = {'date': 'Unknown', 'children': [], 'subject': '',
               'userlink': '(Anonymous)',
               # A comment with no <body> element at all is rendered the
               # same way as one whose <body> is empty.
               'body': '<b>Deleted comment</b>'}
    for field in node.childNodes:
        if field.nodeType != xml.dom.Node.ELEMENT_NODE:
            continue
        value = field.firstChild
        if field.nodeName == 'body':
            if value is not None:
                comment['body'] = process_text(value.nodeValue)
        elif value is not None:
            # Empty elements keep the defaults set above.
            comment[str(field.nodeName)] = value.nodeValue
    return comment


def _read_comments(commentfile):
    """
    Parse a C-nnnn dump file.

    Returns (tree, count): tree is the list of top-level comment dicts
    with replies nested under 'children', count is the total number of
    comments in the file.
    """
    comment_xml = xml.dom.minidom.parse(commentfile)
    # Comments are assumed to be already sorted by posting time, so a
    # parent is always seen before its replies.
    comment_tree = []
    comment_hash = {}
    comment_count = 0
    for c in comment_xml.documentElement.childNodes:
        if c.nodeType != xml.dom.Node.ELEMENT_NODE or c.nodeName != 'comment':
            continue
        comment = _parse_comment(c)
        comment_hash[comment['id']] = comment
        # Count every comment, not only the replies.
        comment_count += 1
        parent = comment_hash.get(comment.get('parentid'))
        if parent is not None:
            parent['children'].append(comment)
        else:
            # Top-level comment, or a reply whose parent is not in the file.
            comment_tree.append(comment)
    return comment_tree, comment_count


def do_post(postfile, commentfile, outputfile):
    """
    Render one post, with its comments, into an HTML file.

    postfile    -- path of the L-nnnn dump file of the post
    commentfile -- path of the matching C-nnnn dump file, or None when
                   the post has no comment file
    outputfile  -- path of the HTML file to write (UTF-8)

    Returns the tuple (logtime, ditemid, subject, taglist) taken from
    the post.

    Raises ValueError when the post has no 'event' element and KeyError
    when the post lacks 'logtime' or 'ditemid', when a comment lacks
    'id', or when the template refers to a missing key.  The 'userpic'
    key is only available to the template when a readable picture file
    is found in dirs['archive'].
    """
    post_props = _read_post(postfile)

    userpic = post_props.get('picture_keyword', '_')
    for fmt in ('jpg', 'gif', 'png'):
        if os.access("%s/%s.%s" % (dirs['archive'], userpic, fmt), os.R_OK):
            post_props['userpic'] = '%s/userpics/%s.%s' % (urls['images'], userpic, fmt)
            break

    if commentfile:
        comment_tree, comment_count = _read_comments(commentfile)
        post_props['comments'] = format_comments(comment_tree)
        post_props['comment_count'] = comment_count
    else:
        post_props['comments'] = ''
        post_props['comment_count'] = 0

    page = template['post'] % post_props
    with codecs.open(outputfile, "w", "utf-8") as f:
        f.write(page)
    return (post_props['logtime'], post_props['ditemid'],
            post_props['subject'], post_props['taglist'])


def main():
    """
    Convert every post of the dump directory into a static HTML page.

    Reads ljmkstatic.conf from the current directory, loads the
    templates and settings, then renders each L-nnnn file found in
    dirs['dump'] (together with C-nnnn, when it exists) into nnnn.html
    in the same directory.  A post is skipped when its HTML file is
    newer than both source files.

    Raises ValueError when the config file cannot be read.
    """
    config = ConfigParser()
    # read() returns the list of files it parsed; an empty list means the
    # config file is missing.  Comparing that list with an int never
    # detects this on Python 2 and is a TypeError on Python 3.
    if not config.read(["ljmkstatic.conf"]):
        raise ValueError("No config file found")
    read_templates(config)
    set_parameters(config)
    for post_file in sorted(glob.glob(dirs['dump'] + "/L-*")):
        id_match = re.search(r"(\d+)$", post_file)
        if id_match is None:
            # Matches the glob but does not end in a post number; skip it.
            continue
        post_id = id_match.group(1)
        comment_file = dirs['dump'] + "/C-" + post_id
        outfile = dirs['dump'] + "/" + post_id + ".html"
        try:
            t1 = os.stat(post_file).st_mtime
            try:
                t2 = os.stat(comment_file).st_mtime
            except OSError:
                # No comment file for this post.
                t2 = 0
                comment_file = None
            t3 = os.stat(outfile).st_mtime
            if t3 > t1 and t3 > t2:
                # The HTML page is newer than both sources.
                continue
        except OSError:
            # The output file does not exist yet, so it must be generated.
            pass
        print("Processing post L-%s" % post_id)
        # do_post returns (date, post id, subject, tags).
        # TODO: use the returned values to update the index structures.
        do_post(post_file, comment_file, outfile)


if __name__ == '__main__':
    main()