This repository was archived by the owner on Dec 22, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjtoh.go
More file actions
243 lines (207 loc) · 6.13 KB
/
jtoh.go
File metadata and controls
243 lines (207 loc) · 6.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
package jtoh
import (
"bytes"
"encoding/json"
"fmt"
"io"
"os"
"strings"
)
// J is a jtoh transformer, it transforms JSON into something more human
type J struct {
	// separator is placed between the selected field values on each output line.
	separator string
	// fieldSelectors are the dot-delimited field paths extracted from each JSON object.
	fieldSelectors []string
}
// Err is an exported jtoh error. It is a string-based error type so
// that error values can be declared as constants (see InvalidSelectorErr).
type Err string
// InvalidSelectorErr represents errors with the provided fields selector
// (too short, or using '.' as the separator). Wrapped errors returned by
// New can be matched against it with errors.Is.
const InvalidSelectorErr Err = "invalid selector"
// New creates a new jtoh transformer using the given selector.
// The selector is on the form <separator><field selector 1><separator><field selector 2>
// For example, given ":" as a separator you can define:
//
// :fieldA:fieldB:fieldC
//
// Accessing a nested field is done with dot to access nested fields, like this:
//
// :field.nested
//
// Making "." the only character that will not be allowed to be used
// as a separator since it is already a selector for nested fields.
//
// If the selector is invalid it returns an error.
func New(s string) (J, error) {
	runes := []rune(s)
	// A usable selector needs at least the separator plus one field name.
	if len(runes) <= 1 {
		return J{}, fmt.Errorf("%w:%s", InvalidSelectorErr, s)
	}
	sep := string(runes[0])
	if sep == "." {
		return J{}, fmt.Errorf("%w:can't use '.' as separator", InvalidSelectorErr)
	}
	rawSelectors := strings.Split(string(runes[1:]), sep)
	return J{
		separator:      sep,
		fieldSelectors: trimSpaces(rawSelectors),
	}, nil
}
// Do receives a json stream as input and transforms it
// in lines of text (newline-delimited) which is
// then written in the provided writer.
//
// Each decoded JSON object produces one line: the configured field
// selectors resolved against the object and joined by the separator.
// Bytes that fail to decode as JSON are buffered and echoed through
// verbatim, preserving their position relative to the valid objects.
//
// This function will block until all data is read from the input
// and written on the output.
func (j J) Do(jsonInput io.Reader, linesOutput io.Writer) {
	jsonInput, ok := isList(jsonInput)
	// Why not bufio ? what we need here is kinda like
	// buffered io, but not exactly the same (was not able to
	// come up with a better name to it).
	bufinput := bufferedReader{r: jsonInput}
	dec := json.NewDecoder(&bufinput)
	if ok {
		// WHY: To handle properly gigantic lists of JSON objs
		// Consuming only the opening '[' token lets the loop below
		// decode list elements one at a time instead of all at once.
		// Really don't need the return value, but linters can be annoying =P
		_, _ = dec.Token()
		bufinput.reset()
	}
	var errBuffer []byte
	// TODO: Right now we have space complexity O(N) when the input is not JSON
	// For huge chunks of non JSON data this may be a problem
	for bufinput.hasData() {
		for dec.More() {
			m := map[string]interface{}{}
			err := dec.Decode(&m)
			// readBuffer holds exactly the bytes consumed by this Decode
			// attempt (bufferedReader feeds the decoder byte by byte);
			// capture them before resetting the buffer.
			dataUsedOnDecode := bufinput.readBuffer()
			bufinput.reset()
			if err != nil {
				// Not valid JSON: remember the raw bytes to echo later and
				// restart with a fresh decoder positioned after the bad data.
				errBuffer = append(errBuffer, dataUsedOnDecode...)
				dec = json.NewDecoder(&bufinput)
				continue
			}
			// A successful decode first flushes any pending non-JSON bytes
			// so input ordering is preserved in the output.
			writeErrs(linesOutput, errBuffer)
			errBuffer = nil
			fieldValues := make([]string, len(j.fieldSelectors))
			for i, fieldSelector := range j.fieldSelectors {
				fieldValues[i] = selectField(fieldSelector, m)
			}
			fmt.Fprint(linesOutput, strings.Join(fieldValues, j.separator)+"\n")
		}
		// dec.More() is false (end of list or decoder stuck on an error):
		// start over with a fresh decoder while the source still has data.
		dec = json.NewDecoder(&bufinput)
	}
	writeErrs(linesOutput, errBuffer)
}
func writeErrs(w io.Writer, errBuffer []byte) {
if len(errBuffer) == 0 {
return
}
errBuffer = append(errBuffer, '\n')
n, err := w.Write(errBuffer)
if err != nil {
fmt.Fprintf(os.Stderr, "jtoh:error writing error buffer: wrote %d bytes, details: %v\n", n, err)
}
}
// selectField resolves a (possibly dot-nested) field selector against a
// decoded JSON object and returns the value formatted as a single line.
// A missing field, or a non-object intermediate value, yields the
// missing-field placeholder instead.
func selectField(selector string, obj map[string]interface{}) string {
	const accessOp = "."
	path := strings.Split(selector, accessOp)
	last := len(path) - 1
	current := obj
	// Walk every component except the last; each intermediate value
	// must itself be an object for the traversal to continue.
	for _, key := range path[:last] {
		child, found := current[key]
		if !found {
			return missingFieldErrMsg(selector)
		}
		nested, isObj := child.(map[string]interface{})
		if !isObj {
			return missingFieldErrMsg(selector)
		}
		current = nested
	}
	value, found := current[path[last]]
	if !found {
		return missingFieldErrMsg(selector)
	}
	// Escape literal newlines so every record stays on one output line.
	return strings.ReplaceAll(fmt.Sprint(value), "\n", "\\n")
}
// missingFieldErrMsg builds the placeholder value emitted when a
// selector does not resolve to a field on the decoded JSON object.
func missingFieldErrMsg(sel string) string {
	return fmt.Sprintf("<jtoh:missing field %q>", sel)
}
func isList(jsons io.Reader) (io.Reader, bool) {
buf := make([]byte, 1)
// WHY: was unable to find something like peek on json Decoder
for {
n, err := jsons.Read(buf)
if err != nil {
// FIXME: Probably would be better to fail here with a more clear error =P
return jsons, false
}
if n == 0 {
// From the docs:
//
// https://golang.org/pkg/io/#Reader
//
// Implementations of Read are discouraged from
// returning a zero byte count with a nil error,
// except when len(p) == 0. Callers should treat a
// return of 0 and nil as indicating that nothing happened;
// in particular it does not indicate EOF.
//
// Hope it doesn't result in some infinite loop =/
continue
}
firstToken := buf[0]
if isSpace(firstToken) {
continue
}
isList := firstToken == '['
return io.MultiReader(bytes.NewBuffer([]byte{firstToken}), jsons), isList
}
}
// isSpace reports whether c is one of the ASCII whitespace
// characters skipped while peeking for the first JSON token.
func isSpace(c byte) bool {
	switch c {
	case ' ', '\t', '\r', '\n':
		return true
	default:
		return false
	}
}
// Error implements the error interface, returning the underlying
// string so Err values can serve as constant sentinel errors.
func (e Err) Error() string {
	return string(e)
}
// trimSpaces returns a new slice where every element of s has had
// its leading and trailing whitespace removed.
func trimSpaces(s []string) []string {
	out := make([]string, len(s))
	for i := range s {
		out[i] = strings.TrimSpace(s[i])
	}
	return out
}
// bufferedReader is not exactly like the bufio on stdlib.
// The idea is to use it as a means to buffer read data
// until reset is called. We need this so when
// the JSON decoder finds an error in the stream we can retrieve
// exactly how much has been read between the last successful
// decode and the current error and echo it.
//
// To guarantee that we provide data byte per byte, which is
// not terribly efficient but was the only way so far to be sure
// (assuming that the json decoder does no lookahead) that when
// an error occurs on the json decoder we have the exact byte stream that
// caused the error (I would welcome with open arms a better solution x_x).
type bufferedReader struct {
r io.Reader
buffer []byte
readErr error
}
func (b *bufferedReader) Read(data []byte) (int, error) {
if len(data) == 0 {
return 0, nil
}
data = data[:1]
n, err := b.r.Read(data)
b.readErr = err
if n > 0 {
b.buffer = append(b.buffer, data[0])
}
return n, err
}
func (b *bufferedReader) hasData() bool {
return b.readErr == nil
}
func (b *bufferedReader) readBuffer() []byte {
return b.buffer
}
func (b *bufferedReader) reset() {
b.buffer = make([]byte, 0, 1024)
}