This repository was archived by the owner on Oct 29, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcdxj.go
More file actions
256 lines (217 loc) · 8.17 KB
/
cdxj.go
File metadata and controls
256 lines (217 loc) · 8.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
// Package cdxj implements the CDXJ file format used by OpenWayback 3.0.0 (and later) to index web archive contents
// (notably in WARC and ARC files) and make them searchable via a resource resolution service.
// The format builds on the CDX file format originally developed by the Internet Archive
// for the indexing behind the WaybackMachine.
// This specification builds on it by simplifying the primary fields while adding a flexible JSON 'block'
// to each record, allowing high flexibility in the inclusion of additional data.
package cdxj
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/purell"
"github.com/datatogether/warc"
)
// CanonicalizationScheme is the set of purell normalization flags that
// CanonicalizeURL passes to purell.NormalizeURLString. It defaults to
// purell.FlagsSafe. Because it is a package-level variable, reassigning it
// changes the result of every later CanonicalizeURL call.
var CanonicalizationScheme = purell.FlagsSafe
// Record is an entry in a cdxj index, consisting of uri, timestamp, recordtype, and metadata fields.
// Following the header lines, each additional line should represent exactly one resource in a web archive,
// typically in a WARC (ISO 28500) or ARC file, although the exact storage of the resource is not defined
// by this specification. Each such line shall be referred to as a *record*.
type Record struct {
// Searchable URI
// By *searchable*, we mean that the following transformations have been applied to it:
// 1. Canonicalization - See Appendix A
// 2. Sort-friendly URI Reordering Transform (SURT)
// 3. The scheme is dropped from the SURT format
//
// NOTE(review): NewRecord and NewRecordFromWARCRecord store the SURT form here and
// MarshalCDXJ writes this field verbatim, but UnmarshalCDXJ stores the output of
// UnSurtURL (host-first form) instead. Confirm which form callers should expect.
URI string
// Timestamp should correspond to the WARC-Date timestamp as of WARC 1.1.
// The timestamp shall represent the instant that data capture for record
// creation began.
// All timestamps should be in UTC. MarshalCDXJ converts the value to UTC and
// formats it as RFC 3339; UnmarshalCDXJ parses it with the same layout.
Timestamp time.Time
// RecordType indicates what type of record the current line refers to.
// This field is fully compatible with the WARC 1.0 definition of
// WARC-Type (chapter 5.5 and chapter 6).
RecordType warc.RecordType
// JSON should contain fully valid JSON data. The only limitation, beyond those
// imposed by JSON encoding rules, is that this may not contain any newline
// characters, either in Unix (0x0A) or Windows form (0x0D0A).
// The first occurrence of a 0x0A constitutes the end of this field (and the record).
JSON map[string]interface{}
}
// NewResponseRecord builds a Record whose RecordType is warc.RecordTypeResponse.
// The url, timestamp and JSON data are handed to NewRecord unchanged.
func NewResponseRecord(url string, ts time.Time, data map[string]interface{}) *Record {
	return NewRecord(
		url,
		ts,
		warc.RecordTypeResponse,
		data,
	)
}
// NewResourceRecord builds a Record whose RecordType is warc.RecordTypeResource.
// The url, timestamp and JSON data are handed to NewRecord unchanged.
func NewResourceRecord(url string, ts time.Time, data map[string]interface{}) *Record {
	return NewRecord(
		url,
		ts,
		warc.RecordTypeResource,
		data,
	)
}
// NewRequestRecord builds a Record whose RecordType is warc.RecordTypeRequest.
// The url, timestamp and JSON data are handed to NewRecord unchanged.
func NewRequestRecord(url string, ts time.Time, data map[string]interface{}) *Record {
	return NewRecord(
		url,
		ts,
		warc.RecordTypeRequest,
		data,
	)
}
// NewMetadataRecord builds a Record whose RecordType is warc.RecordTypeMetadata.
// The url, timestamp and JSON data are handed to NewRecord unchanged.
func NewMetadataRecord(url string, ts time.Time, data map[string]interface{}) *Record {
	return NewRecord(
		url,
		ts,
		warc.RecordTypeMetadata,
		data,
	)
}
// NewRevisitRecord builds a Record whose RecordType is warc.RecordTypeRevisit.
// The url, timestamp and JSON data are handed to NewRecord unchanged.
func NewRevisitRecord(url string, ts time.Time, data map[string]interface{}) *Record {
	return NewRecord(
		url,
		ts,
		warc.RecordTypeRevisit,
		data,
	)
}
// NewRecord creates a new cdxj record for url with the given timestamp,
// record type and JSON metadata.
//
// The URI field is derived on a best-effort basis and never causes a failure:
// the url is canonicalized with CanonicalizeURL (falling back to the raw url
// if that fails) and the result is converted with SurtURL. If the SURT
// conversion fails, the raw, unmodified url is stored instead.
// ts, rt and data are stored as given.
func NewRecord(url string, ts time.Time, rt warc.RecordType, data map[string]interface{}) *Record {
	// Start from the raw url and upgrade to the canonical form when available.
	target := url
	if canonical, err := CanonicalizeURL(url); err == nil {
		target = canonical
	}

	key, err := SurtURL(target)
	if err != nil {
		// Deliberate fallback to the caller's original string, not the
		// canonicalized one.
		key = url
	}

	rec := &Record{
		URI:        key,
		Timestamp:  ts,
		RecordType: rt,
		JSON:       data,
	}
	return rec
}
// NewRecordFromWARCRecord generates a cdxj record from a WARC record.
//
// The record's target URI is canonicalized with CanonicalizeURL and converted
// with SurtURL; unlike NewRecord, a failure in either step is returned to the
// caller together with a nil Record. The timestamp and record type are taken
// from rec.Date() and rec.Type, and the JSON block starts out as an empty,
// non-nil map.
func NewRecordFromWARCRecord(rec *warc.Record) (*Record, error) {
	canonical, err := CanonicalizeURL(rec.TargetURI())
	if err != nil {
		return nil, err
	}

	key, err := SurtURL(canonical)
	if err != nil {
		return nil, err
	}

	out := &Record{
		URI:        key,
		Timestamp:  rec.Date(),
		RecordType: rec.Type,
		JSON:       map[string]interface{}{},
	}
	return out, nil
}
// UnmarshalCDXJ reads a cdxj record from a byte slice into r.
//
// The input is expected to hold four space-separated fields:
//
//	<surt uri> <RFC 3339 timestamp> <record type> <json object>
//
// The first three fields must each be terminated by a space; if one is not,
// the read error from the underlying reader is returned. The URI is passed
// through UnSurtURL before being stored, the record type is parsed with
// warc.ParseRecordType, and whatever follows the third space is decoded as
// JSON into a fresh map. Fields are assigned as they are parsed, so r may be
// partially updated when an error is returned.
func (r *Record) UnmarshalCDXJ(data []byte) (err error) {
	in := bufio.NewReader(bytes.NewReader(data))

	// Field 1: searchable URI. The trailing space is stripped by UnSurtURL.
	field, err := in.ReadString(' ')
	if err != nil {
		return err
	}
	if r.URI, err = UnSurtURL(field); err != nil {
		return err
	}

	// Field 2: capture timestamp.
	if field, err = in.ReadString(' '); err != nil {
		return err
	}
	if r.Timestamp, err = time.Parse(time.RFC3339, strings.TrimSpace(field)); err != nil {
		return err
	}

	// Field 3: WARC record type.
	if field, err = in.ReadString(' '); err != nil {
		return err
	}
	r.RecordType = warc.ParseRecordType(strings.TrimSpace(field))

	// Remainder: the JSON block. Any previous contents of r.JSON are discarded.
	r.JSON = map[string]interface{}{}
	return json.NewDecoder(in).Decode(&r.JSON)
}
// MarshalCDXJ outputs a CDXJ representation of r: a single newline-terminated
// line of the form
//
//	<URI> <timestamp> <record type> <json>
//
// URI is written exactly as stored, the timestamp is converted to UTC and
// formatted as RFC 3339, the record type is rendered with the %s verb, and the
// JSON field is encoded with json.Marshal. An encoding error is returned with
// a nil slice.
func (r *Record) MarshalCDXJ() ([]byte, error) {
	meta, err := json.Marshal(r.JSON)
	if err != nil {
		return nil, err
	}

	stamp := r.Timestamp.UTC().Format(time.RFC3339)
	line := fmt.Sprintf("%s %s %s %s\n", r.URI, stamp, r.RecordType, meta)
	return []byte(line), nil
}
// CanonicalizeURL takes raw url strings & returns their normalized version
// Canonicalization is applied to URIs to remove trivial
// differences in the URIs that do not reflect that the
// URI reference different resources.
// Examples include removing session ID parameters,
// unneccessary port declerations (e.g. :80 when crawling HTTP).
// OpenWayback implements its own canonicalization process.
// Typically, it will be applied to the searchable URIs in CDXJ files. You can,
// however, use any canonicalization scheme you care for (including none).
// You must simply ensure that the same canonicalization process is
// applied to the URIs when performing searches.
// Otherwise they may not match correctly.
func CanonicalizeURL(rawurl string) (string, error) {
return purell.NormalizeURLString(rawurl, CanonicalizationScheme)
}
// SurtURL converts rawurl into the scheme-less Sort-friendly URI Reordering
// Transform (SURT) form used as the searchable key of a CDXJ record. The
// transformation makes the left-to-right representation better match the
// natural hierarchy of domain names: `scheme://domain.tld/path?query` becomes
// `(tld,domain,)/path?query`.
//
// Details of the conversion:
//   - the whole input is lowercased before parsing;
//   - only the hostname is kept from the authority, with its dot-separated
//     labels reversed and comma-joined; the scheme, user info, port and
//     fragment do not appear in the result;
//   - the path is emitted in its escaped form, so percent-encoded octets
//     (for example %20 or %2f) stay encoded and cannot introduce a space
//     into the space-delimited CDXJ line or be confused with a real '/';
//   - the '/' after the authority only appears if it was present in rawurl;
//   - a non-empty query string is appended verbatim after a '?'.
//
// If rawurl cannot be parsed, the lowercased input is returned together with
// the parse error.
func SurtURL(rawurl string) (string, error) {
	rawurl = strings.ToLower(rawurl)

	// TODO - input without a scheme is not handled: prepending "http://" when
	// "://" is absent would misfire when only a query parameter contains a
	// url, so this probably needs a regex.
	u, err := url.Parse(rawurl)
	if err != nil {
		return rawurl, err
	}

	labels := strings.Split(u.Hostname(), ".")
	reverseSlice(labels)

	// EscapedPath (rather than the decoded u.Path) keeps percent-encoding
	// intact. Escapes generated by net/url use upper-case hex, so lowercase
	// the result to keep the SURT form all lower-case.
	path := strings.ToLower(u.EscapedPath())

	surt := fmt.Sprintf("(%s,)%s", strings.Join(labels, ","), path)
	if u.RawQuery != "" {
		surt += "?" + u.RawQuery
	}
	return surt, nil
}
// UnSurtURL turns a SURT'ed url back into a normal url of the form
// `host.tld/path?query` (no scheme is added).
//
// Any of the characters '(', '>', space and newline are first stripped from
// both ends of the input. Everything up to and including the first ')' is
// treated as the reversed, comma-separated hostname; its labels are put back
// in order and joined with dots, and the remainder of the string is appended
// unchanged. If the input contains no ')', the stripped input is returned
// together with the read error.
//
// TODO - should accept SURT urls that contain a scheme
func UnSurtURL(surturl string) (string, error) {
	trimmed := strings.Trim(surturl, "(> \n")

	authority, err := bufio.NewReader(strings.NewReader(trimmed)).ReadString(')')
	if err != nil {
		return trimmed, err
	}
	rest := trimmed[len(authority):]

	// Drop the trailing ",)" before splitting so no empty label is produced.
	labels := strings.Split(strings.Trim(authority, ",)"), ",")
	reverseSlice(labels)

	return fmt.Sprintf("%s%s", strings.Join(labels, "."), rest), nil
}
// UnSurtPath gives the path element of a SURT'ed url: everything that follows
// the first ')' (query string included), guaranteed to start with '/'.
//
// Any of the characters '(', '>', space and newline are first stripped from
// both ends of the input. If the input contains no ')', the stripped input is
// returned together with the read error.
func UnSurtPath(surturl string) (string, error) {
	trimmed := strings.Trim(surturl, "(> \n")

	authority, err := bufio.NewReader(strings.NewReader(trimmed)).ReadString(')')
	if err != nil {
		return trimmed, err
	}

	rest := trimmed[len(authority):]
	if strings.HasPrefix(rest, "/") {
		return rest, nil
	}
	// Empty or slash-less remainder: anchor it at the root.
	return "/" + rest, nil
}
// reverseSlice reverses a slice of strings in place by swapping each element
// in the first half with its mirror in the second half. Nil, empty and
// single-element slices are left untouched.
func reverseSlice(s []string) {
	n := len(s)
	for i := 0; i < n/2; i++ {
		mirror := n - 1 - i
		s[i], s[mirror] = s[mirror], s[i]
	}
}