Skip to content

Commit 1f11500

Browse files
authored
add use_of_customfuncs.md (#125)
Add use_of_customfuncs.md Also fix a couple lint issues by unexporting constants (that don't need to be exported)
1 parent f0d8160 commit 1f11500

File tree

12 files changed

+455
-207
lines changed

12 files changed

+455
-207
lines changed

customfuncs/datetime.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ func DateTimeLayoutToRFC3339(_ *transformctx.Ctx, datetime, layout, layoutTZ, fr
114114
}
115115

116116
const (
117-
EpochUnitMilliseconds = "MILLISECOND"
118-
EpochUnitSeconds = "SECOND"
117+
epochUnitMilliseconds = "MILLISECOND"
118+
epochUnitSeconds = "SECOND"
119119
)
120120

121121
// DateTimeToEpoch parses a 'datetime' string intelligently, and returns its epoch number. 'fromTZ'
@@ -131,9 +131,9 @@ func DateTimeToEpoch(_ *transformctx.Ctx, datetime, fromTZ, unit string) (string
131131
return "", err
132132
}
133133
switch unit {
134-
case EpochUnitMilliseconds:
134+
case epochUnitMilliseconds:
135135
return strconv.FormatInt(t.UnixNano()/int64(time.Millisecond), 10), nil
136-
case EpochUnitSeconds:
136+
case epochUnitSeconds:
137137
return strconv.FormatInt(t.Unix(), 10), nil
138138
default:
139139
return "", fmt.Errorf("unknown epoch unit '%s'", unit)
@@ -163,9 +163,9 @@ func EpochToDateTimeRFC3339(_ *transformctx.Ctx, epoch, unit string, tz ...strin
163163
}
164164
var t time.Time
165165
switch unit {
166-
case EpochUnitSeconds:
166+
case epochUnitSeconds:
167167
t = time.Unix(n, 0)
168-
case EpochUnitMilliseconds:
168+
case epochUnitMilliseconds:
169169
t = time.Unix(0, n*(int64(time.Millisecond)))
170170
default:
171171
return "", fmt.Errorf("unknown epoch unit '%s'", unit)

customfuncs/datetime_test.go

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -216,23 +216,23 @@ func TestDateTimeToEpoch(t *testing.T) {
216216
name: "empty datetime -> no op",
217217
datetime: "",
218218
fromTZ: "UTC",
219-
unit: EpochUnitMilliseconds,
219+
unit: epochUnitMilliseconds,
220220
err: "",
221221
expected: "",
222222
},
223223
{
224224
name: "invalid datetime",
225225
datetime: "invalid",
226226
fromTZ: "UTC",
227-
unit: EpochUnitSeconds,
227+
unit: epochUnitSeconds,
228228
err: "unable to parse 'invalid' in any supported date/time format",
229229
expected: "",
230230
},
231231
{
232232
name: "invalid fromTZ",
233233
datetime: "2020/09/22T12:34:56",
234234
fromTZ: "invalid",
235-
unit: EpochUnitMilliseconds,
235+
unit: epochUnitMilliseconds,
236236
err: "unknown time zone invalid",
237237
expected: "",
238238
},
@@ -248,23 +248,23 @@ func TestDateTimeToEpoch(t *testing.T) {
248248
name: "datetime no tz; no fromTZ",
249249
datetime: "2020/09/22T12:34:56",
250250
fromTZ: "",
251-
unit: EpochUnitMilliseconds,
251+
unit: epochUnitMilliseconds,
252252
err: "",
253253
expected: "1600778096000",
254254
},
255255
{
256256
name: "datetime no tz; with fromTZ",
257257
datetime: "2020/09/22T12:34:56",
258258
fromTZ: "America/Los_Angeles",
259-
unit: EpochUnitSeconds,
259+
unit: epochUnitSeconds,
260260
err: "",
261261
expected: "1600803296",
262262
},
263263
{
264264
name: "datetime with tz; with fromTZ",
265265
datetime: "2020/09/22T12:34:56-05",
266266
fromTZ: "America/Los_Angeles",
267-
unit: EpochUnitSeconds,
267+
unit: epochUnitSeconds,
268268
err: "",
269269
expected: "1600796096",
270270
},
@@ -295,31 +295,31 @@ func TestEpochToDateTimeRFC3339(t *testing.T) {
295295
{
296296
name: "empty epoch -> no op",
297297
epoch: "",
298-
unit: EpochUnitMilliseconds,
298+
unit: epochUnitMilliseconds,
299299
tz: nil,
300300
err: "",
301301
expected: "",
302302
},
303303
{
304304
name: "more than one tz specified",
305305
epoch: "1234567",
306-
unit: EpochUnitMilliseconds,
306+
unit: epochUnitMilliseconds,
307307
tz: []string{"UTC", "UTC"},
308308
err: "cannot specify tz argument more than once",
309309
expected: "",
310310
},
311311
{
312312
name: "invalid epoch",
313313
epoch: "invalid",
314-
unit: EpochUnitSeconds,
314+
unit: epochUnitSeconds,
315315
tz: nil,
316316
err: `strconv.ParseInt: parsing "invalid": invalid syntax`,
317317
expected: "",
318318
},
319319
{
320320
name: "invalid tz",
321321
epoch: "12345",
322-
unit: EpochUnitSeconds,
322+
unit: epochUnitSeconds,
323323
tz: []string{"invalid"},
324324
err: "unknown time zone invalid",
325325
expected: "",
@@ -335,15 +335,15 @@ func TestEpochToDateTimeRFC3339(t *testing.T) {
335335
{
336336
name: "no tz",
337337
epoch: "1234567890123",
338-
unit: EpochUnitMilliseconds,
338+
unit: epochUnitMilliseconds,
339339
tz: nil,
340340
err: "",
341341
expected: "2009-02-13T23:31:30Z",
342342
},
343343
{
344344
name: "with tz",
345345
epoch: "1234567890",
346-
unit: EpochUnitSeconds,
346+
unit: epochUnitSeconds,
347347
tz: []string{"America/Los_Angeles"},
348348
err: "",
349349
expected: "2009-02-13T15:31:30-08:00",

doc/use_of_custom_funcs.md

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
# Use of `custom_func`, Especially `javascript`
2+
3+
`custom_func` is a transform that allows a schema writer to alter, compose, transform and aggregate existing
4+
data from the input. Among [all `custom_func`](./customfuncs.md), [`javascript`](TODO) is the most important
5+
one to understand and master.
6+
7+
A `custom_func` has 4 basic parts: `xpath`/`xpath_dynamic`, `name`, `args`, and `type`.
8+
9+
Like any other transform, `custom_func` uses the optional `xpath`/`xpath_dynamic` directive to move the current
10+
IDR tree cursor. See [here](xpath.md#data-context-and-anchoring) for more details.
11+
12+
`name` is self-explanatory.
13+
14+
`args` is a list of arguments, which themselves are transforms recursively, to the function.
15+
16+
Optional `type` indicates a result type cast is needed. Valid types are `'string'`, `'int'`, `'float'`,
17+
and `'boolean'`. Not specifying `type` tells omniparser to keep whatever type of the result from the
18+
`custom_func` as is.
19+
20+
## Basic Examples
21+
22+
1. Fixed Argument List
23+
24+
Look at the following transform example:
25+
```
26+
"carrier": { "custom_func": { "name": "lower", "args": [ { "xpath": "./CARRIER_NAME" } ] } },
27+
```
28+
This transform, in English, takes the value of the immediate child node `CARRIER_NAME` from the current
29+
IDR tree cursor position, and returns it in lower-case.
30+
31+
2. Variable Argument List
32+
33+
Look at the following transform example (adapted from
34+
[here](../extensions/omniv21/samples/fixedlength/2_multi_rows.schema.json)):
35+
```
36+
"event_datetime": { "custom_func": {
37+
"name": "concat",
38+
"args": [
39+
{ "xpath": "event_date" },
40+
{ "const": "T" },
41+
{ "xpath": "event_time" }
42+
]
43+
}},
44+
```
45+
This transform, in English, takes the values of a child node `event_date`, a constant string `T` and a
46+
child node `event_time`, and returns them concatenated.
47+
48+
3. Chaining/Composability
49+
50+
Arguments of a `custom_func` transform can also be `custom_func`, thus enabling chaining and
51+
composability. Look at the following example (adapted from
52+
[here](../extensions/omniv21/samples/fixedlength/2_multi_rows.schema.json)):
53+
```
54+
"event_date_template": { "custom_func": {
55+
"name": "dateTimeToRFC3339",
56+
"args": [
57+
{ "custom_func": {
58+
"name": "concat",
59+
"args": [
60+
{ "xpath": "event_date" },
61+
{ "const": "T" },
62+
{ "xpath": "event_time" }
63+
]
64+
}},
65+
{ "xpath": "event_timezone", "_comment": "input timezone" },
66+
{ "const": "", "_comment": "output timezone" }
67+
]
68+
}}
69+
```
70+
This transform, in English, concatenates child nodes to produce a full event datetime string and then
71+
uses `dateTimeToRFC3339` to normalize the datetime string into RFC3339 standard format.
72+
73+
There is no limit on how deep `custom_func` chaining can be.
74+
75+
4. `xpath`/`xpath_dynamic` Anchoring
76+
77+
Schema writer can also use `xpath` (or `xpath_dynamic`) to change current IDR tree cursor to make
78+
data extractions on arguments easier. Consider the same transform as above but imagine this time
79+
all the event date time related fields are not at the current IDR cursor node, but rather in a
80+
child node `data`. Instead of writing each data extract `xpath` in the arguments as `"data/..."`, we
81+
can simply move the cursor to `data`, by specifying `xpath` on `custom_func` itself.
82+
```
83+
"event_date_template": { "xpath": "data", "custom_func": {
84+
"name": "dateTimeToRFC3339",
85+
"args": [
86+
{ "custom_func": {
87+
"name": "concat",
88+
"args": [
89+
{ "xpath": "event_date" },
90+
{ "const": "T" },
91+
{ "xpath": "event_time" }
92+
]
93+
}},
94+
{ "xpath": "event_timezone", "_comment": "input timezone" },
95+
{ "const": "", "_comment": "output timezone" }
96+
]
97+
}}
98+
```
99+
100+
## `javascript` and `javascript_with_context`
101+
102+
Omniparser has several basic `custom_func` like `lower`, `upper`, `dateTimeToRFC3339`, `uuidv3`, etc, among
103+
which the most important, flexible and powerful one is `javascript` (and its sibling
104+
`javascript_with_context`).
105+
106+
`javascript` is a `custom_func` transform that executes a JavaScript snippet with optional input arguments.
107+
Omniparser uses https://github.com/dop251/goja, a native Golang ECMAScript implementation thus **free of
108+
external C/C++ lib dependencies**.
109+
110+
A simple example (adapted from [here](../extensions/omniv21/samples/csv/1_weather_data_csv.schema.json)):
111+
```
112+
"temp_in_f": { "custom_func": {
113+
"name": "javascript",
114+
"args": [
115+
{ "const": "Math.floor((temp_c * 9 / 5 + 32) * 10) / 10" },
116+
{ "const": "temp_c" }, { "xpath": ".", "type": "float" }
117+
]
118+
}}
119+
```
120+
This transform takes the value of the current IDR node, assuming temperature data in celsius, converts
121+
it to fahrenheit.
122+
123+
The first argument is typically a `const` transform that contains JavaScript code. The rest of the
124+
arguments always come in pairs. In each pair, the first argument specifies an input argument name, and the
125+
second specifies the value of the argument. Remember chaining is allowed for advanced composability.
126+
127+
The result type is whatever type the script's return value is, unless the schema writer adds a `type` cast
128+
in the `custom_func` transform to force a type conversion.
129+
130+
If there is any exception thrown in the script, `javascript` transform will fail with an error. If the
131+
result from the script is `NaN`, `null`, `Infinity` or `undefined`, the transform will fail with an error.
132+
133+
Another example (adapted from [here](../extensions/omniv21/samples/csv/1_weather_data_csv.schema.json)):
134+
```
135+
"uv_index": { "custom_func": {
136+
"name": "javascript",
137+
"args": [
138+
{ "const": "uv.split('/').map(function(s){return s.trim();}).filter(function(s){return !!s;})" },
139+
{ "const": "uv" }, { "xpath": "UV_INDEX" }
140+
]
141+
}},
142+
```
143+
where `UV_INDEX` column contains text like `"12/4/6"`.
144+
145+
The script above splits the input by `'/'`, trims away spaces, tosses out empty ones and returns it
146+
as an array, so the result for `"uv_index"` in the output JSON would look like this:
147+
```
148+
"uv_index": [
149+
"12",
150+
"4",
151+
"6"
152+
],
153+
```
154+
155+
So far the input arguments in the samples above are all of singular value. We can also support input
156+
arguments that are arrays, thus enabling aggregation (from
157+
[here](../extensions/omniv21/samples/json/2_multiple_objects.schema.json)):
158+
```
159+
"sum_price_times_10": { "custom_func": {
160+
"name": "javascript",
161+
"args": [
162+
{ "const": "t=0; for (i=0; i<prices.length; i++) { t+=prices[i]*10; } Math.floor(t*100)/100;" },
163+
{ "const": "prices" }, { "array": [ { "xpath": "books/*/price", "type": "float" } ] }
164+
]
165+
}},
166+
```
167+
Contrived, this transform takes all the price values from `"books/*/price"` XPath query, inflates each
168+
by 10 (why oh why?! :)), sums them all up, and returns the sum with 2 decimal places.
169+
170+
Input arguments to `javascript` function can be of simple primitive types (such as string, numbers, etc)
171+
but also objects or arrays, as illustrated above.
172+
173+
To provide ultimate freedom of parsing and transform, `javascript` has an even more powerful sibling
174+
function `javascript_with_context`. `javascript_with_context` is very similar to `javascript`, except that
175+
omniparser automatically injects the current IDR node and its sub-tree as a JSON object into the script
176+
under the global variable name `_node`, thus allowing the script to parse, and transform the current
177+
IDR node tree as it sees fit. (You may ask why not just have `javascript` and auto-inject `_node`? It
178+
is because converting IDR node tree to JSON isn't exactly cheap and for the vast majority of cases, `_node`
179+
isn't needed so `javascript` is perfectly sufficient.)
180+
181+
Consider the following example:
182+
```
183+
"full_name": { "xpath": "./personal_info", "custom_func": {
184+
"name": "javascript_with_context",
185+
"args": [
186+
{ "const": "var n = JSON.parse(_node); n['Last Name'] + ', ' + n['First Name']" }
187+
]
188+
}}
189+
```
190+
assuming the current IDR context for this `"full_name"` transform is:
191+
```
192+
Node(Type: ElementNode)
193+
Node(Type: ElementNode, Data: "First Name")
194+
Node(Type: TextNode, Data: "John")
195+
Node(Type: ElementNode, Data: "Last Name")
196+
Node(Type: TextNode, Data: "Doe")
197+
Node(Type: ElementNode, Data: "Age")
198+
Node(Type: TextNode, Data: "35")
199+
```
200+
201+
When `javascript_with_context` is invoked, omniparser will convert the IDR tree above into a JSON object:
202+
```
203+
{
204+
"First Name": "John",
205+
"Last Name": "Doe",
206+
"Age": "35"
207+
}
208+
```
209+
thus allowing the script to parse the JSON object in and do something about it.
210+
211+
Theoretically, the entire `FINAL_OUTPUT` transform can be done with `javascript_with_context`. However,
212+
the cost/con of doing so or similarly "large-scale" `javascript_with_context` is 1) multiple round trips
213+
of serializing IDR into JSON then parsing JSON into javascript object and 2) it's just hard to write that
214+
much javascript in one line -- the current limitation of schema being strictly JSON which doesn't support
215+
multi-line string literals.
216+
217+
## Error Handling
218+
219+
If any of the argument transforms returns an error, or the custom function itself fails, an error will be
220+
relayed out, unless `ignore_error` is specified.
221+
222+
Look at the following example (adapted from
223+
[here](../extensions/omniv21/samples/fixedlength/2_multi_rows.schema.json)):
224+
```
225+
"event_date_template": { "custom_func": {
226+
"name": "dateTimeToRFC3339",
227+
"args": [
228+
{ "custom_func": {
229+
"name": "concat",
230+
"args": [
231+
{ "xpath": "event_date" },
232+
{ "const": "T" },
233+
{ "xpath": "event_time" }
234+
]
235+
}},
236+
{ "xpath": "event_timezone", "_comment": "input timezone" },
237+
{ "const": "", "_comment": "output timezone" }
238+
],
239+
"ignore_error": true
240+
}}
241+
```
242+
243+
Say the `event_date` and `event_time` contain invalid characters; then `dateTimeToRFC3339` would
244+
typically fail to convert it to RFC3339 standard format, thus failing out the transform of
245+
`FINAL_OUTPUT` for the current record. However, because of `"ignore_error": true`, instead, this
246+
`custom_func` would simply return `nil/null` without error.
247+
248+
If an argument transform value is `nil/null` (possibly due to argument transform failure coupled with
249+
its own `"ignore_error": true`), then this argument's value will be whatever the default value of
250+
the argument type dictates, such as `0` for `int`, `0.0` for `float`, `""` for `string`, etc.

0 commit comments

Comments
 (0)