Skip to content

Commit 1fd2460

Browse files
authored
fixed-length reader.readLine and reader.readByRowsEnvelope (#115)
fixed-length reader.readLine and reader.readByRowsEnvelope
1 parent 6bcd509 commit 1fd2460

File tree

6 files changed

+289
-17
lines changed

6 files changed

+289
-17
lines changed

extensions/omniv21/fileformat/fixedlength/decl.go

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package fixedlength
22

33
import (
44
"fmt"
5+
"unicode/utf8"
56

67
"github.com/jf-tech/go-corelib/caches"
78
)
@@ -27,18 +28,24 @@ func (c *columnDecl) lineMatch(line []byte) bool {
2728
return r.Match(line)
2829
}
2930

30-
func (c *columnDecl) lineToColumn(line []rune) []rune {
31+
func (c *columnDecl) lineToColumnValue(line []byte) string {
3132
// StartPos is 1-based and its value >= 1 guaranteed by json schema validation done earlier.
32-
startPosZeroBased := c.StartPos - 1
33-
// If [startPosZeroBased, c.Length] is partially out of range, we'll return whatever is
34-
// in range; if [startPosZeroBased, c.Length] is fully out of range, we'll return "".
35-
switch {
36-
case startPosZeroBased+c.Length <= len(line):
37-
return line[startPosZeroBased : startPosZeroBased+c.Length]
38-
case startPosZeroBased < len(line):
39-
return line[startPosZeroBased:]
33+
start := c.StartPos - 1
34+
// First chop off the prefix prior to c.StartPos
35+
for start > 0 && len(line) > 0 {
36+
_, adv := utf8.DecodeRune(line)
37+
line = line[adv:]
38+
start--
4039
}
41-
return nil
40+
// Then from that position, count c.Length runes and that's the string value we need.
41+
lenCount := c.Length
42+
i := 0
43+
for lenCount > 0 && i < len(line) {
44+
_, adv := utf8.DecodeRune(line[i:])
45+
i += adv
46+
lenCount--
47+
}
48+
return string(line[:i])
4249
}
4350

4451
type envelopeDecl struct {

extensions/omniv21/fileformat/fixedlength/decl_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@ func TestColumnDecl_LineMatch(t *testing.T) {
1414
assert.True(t, (&columnDecl{LinePattern: strs.StrPtr("^ABC.*$")}).lineMatch([]byte("ABCDEFG")))
1515
}
1616

17-
func TestColumnDecl_LineToColumn(t *testing.T) {
17+
func TestColumnDecl_LineToColumnValue(t *testing.T) {
1818
decl := func(start, length int) *columnDecl {
1919
return &columnDecl{StartPos: start, Length: length}
2020
}
21-
assert.Nil(t, decl(10, 4).lineToColumn([]rune("test"))) // fully out of range
22-
assert.Equal(t, []rune("st"), decl(3, 4).lineToColumn([]rune("test"))) // partially out of range
23-
assert.Equal(t, []rune("tes"), decl(1, 3).lineToColumn([]rune("test"))) // fully in range
21+
assert.Equal(t, "", decl(10, 4).lineToColumnValue([]byte("test"))) // fully out of range
22+
assert.Equal(t, "st", decl(3, 4).lineToColumnValue([]byte("test"))) // partially out of range
23+
assert.Equal(t, "tes", decl(1, 3).lineToColumnValue([]byte("test"))) // fully in range
2424
}
2525

2626
func TestEnvelopeDecl_ByRows(t *testing.T) {
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
package fixedlength
2+
3+
import (
	"bufio"
	"errors"
	"fmt"
	"io"

	"github.com/antchfx/xpath"
	"github.com/jf-tech/go-corelib/ios"

	"github.com/jf-tech/omniparser/idr"
)
13+
// ErrInvalidEnvelope is the error type returned when the input doesn't form a
// valid envelope, e.g. when it ends in the middle of a multi-row envelope. The
// string value is the error message.
type ErrInvalidEnvelope string

// Error implements the error interface by returning the message itself.
func (e ErrInvalidEnvelope) Error() string { return string(e) }

// IsErrInvalidEnvelope reports whether err is an ErrInvalidEnvelope, or wraps
// one anywhere in its Unwrap chain. It returns false for a nil err.
func IsErrInvalidEnvelope(err error) bool {
	var target ErrInvalidEnvelope
	return errors.As(err, &target)
}
26+
27+
// reader holds the state used to read envelopes, line by line, from one
// fixed-length input.
type reader struct {
28+
// inputName identifies the input; it is embedded in messages built by fmtErrStr.
inputName string
29+
// r is the buffered source readLine pulls lines from.
r *bufio.Reader
30+
// decl is the file declaration; readByRowsEnvelope indexes decl.Envelopes by envelopeIndex.
decl *fileDecl
31+
// NOTE(review): xpath, root and target are not used by the methods visible here.
xpath *xpath.Expr
32+
root *idr.Node
33+
target *idr.Node
34+
// envelopeIndex selects the envelope declaration readByRowsEnvelope reads next.
envelopeIndex int
35+
// line is incremented by readLine for every line it consumes (empty ones included)
// and is reported in messages built by fmtErrStr.
line int // 1-based
36+
}
37+
38+
// Note the returned []byte is only valid before the next readLine() call.
39+
func (r *reader) readLine() ([]byte, error) {
40+
for {
41+
line, err := ios.ByteReadLine(r.r)
42+
switch err {
43+
case nil:
44+
r.line++
45+
case io.EOF:
46+
return nil, err
47+
default:
48+
r.line++
49+
return nil, err
50+
}
51+
// skip only truly empty lines.
52+
if len(line) == 0 {
53+
continue
54+
}
55+
return line, nil
56+
}
57+
}
58+
59+
func (r *reader) readByRowsEnvelope() (*idr.Node, error) {
60+
envelopeDecl := r.decl.Envelopes[r.envelopeIndex]
61+
node := idr.CreateNode(idr.ElementNode, *envelopeDecl.Name)
62+
columnsDone := make([]bool, len(envelopeDecl.Columns))
63+
for i := 0; i < envelopeDecl.byRows(); i++ {
64+
line, err := r.readLine()
65+
if err != nil {
66+
if err == io.EOF && i == 0 {
67+
return nil, err
68+
}
69+
return nil, ErrInvalidEnvelope(
70+
r.fmtErrStr("incomplete envelope, missing %d row(s)", envelopeDecl.byRows()-i))
71+
}
72+
for col := range envelopeDecl.Columns {
73+
if columnsDone[col] {
74+
continue
75+
}
76+
colDecl := envelopeDecl.Columns[col]
77+
if !colDecl.lineMatch(line) {
78+
continue
79+
}
80+
colNode := idr.CreateNode(idr.ElementNode, colDecl.Name)
81+
idr.AddChild(node, colNode)
82+
colVal := idr.CreateNode(idr.TextNode, colDecl.lineToColumnValue(line))
83+
idr.AddChild(colNode, colVal)
84+
columnsDone[col] = true
85+
}
86+
}
87+
return node, nil
88+
}
89+
90+
func (r *reader) fmtErrStr(format string, args ...interface{}) string {
91+
return fmt.Sprintf("input '%s' line %d: %s", r.inputName, r.line, fmt.Sprintf(format, args...))
92+
}
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
package fixedlength
2+
3+
import (
4+
"bufio"
5+
"errors"
6+
"io"
7+
"strings"
8+
"testing"
9+
10+
"github.com/jf-tech/go-corelib/strs"
11+
"github.com/jf-tech/go-corelib/testlib"
12+
"github.com/stretchr/testify/assert"
13+
14+
"github.com/jf-tech/omniparser/idr"
15+
)
16+
17+
func TestIsErrInvalidEnvelope(t *testing.T) {
18+
assert.True(t, IsErrInvalidEnvelope(ErrInvalidEnvelope("test")))
19+
assert.Equal(t, "test", ErrInvalidEnvelope("test").Error())
20+
assert.False(t, IsErrInvalidEnvelope(errors.New("test")))
21+
}
22+
23+
func testReader(r io.Reader, decl *fileDecl) *reader {
24+
return &reader{
25+
inputName: "test",
26+
r: bufio.NewReader(r),
27+
decl: decl,
28+
line: 1,
29+
}
30+
}
31+
32+
func TestReadLine(t *testing.T) {
33+
r := testReader(strings.NewReader("abc\n\nefg\n \nxyz\n"), nil)
34+
assert.Equal(t, 1, r.line)
35+
36+
line, err := r.readLine()
37+
assert.NoError(t, err)
38+
assert.Equal(t, []byte("abc"), line)
39+
assert.Equal(t, 2, r.line)
40+
41+
// the second read will skip a truly empty line.
42+
line, err = r.readLine()
43+
assert.NoError(t, err)
44+
assert.Equal(t, []byte("efg"), line)
45+
assert.Equal(t, 4, r.line)
46+
47+
// next line is not truly empty, it contains just spaces, we need to read it in.
48+
line, err = r.readLine()
49+
assert.NoError(t, err)
50+
assert.Equal(t, []byte(" "), line)
51+
assert.Equal(t, 5, r.line)
52+
53+
line, err = r.readLine()
54+
assert.NoError(t, err)
55+
assert.Equal(t, []byte("xyz"), line)
56+
assert.Equal(t, 6, r.line)
57+
58+
// io.EOF shouldn't bump up current line number.
59+
line, err = r.readLine()
60+
assert.Equal(t, io.EOF, err)
61+
assert.Equal(t, 6, r.line)
62+
63+
// reading again should still return io.EOF and line number stays.
64+
line, err = r.readLine()
65+
assert.Equal(t, io.EOF, err)
66+
assert.Equal(t, 6, r.line)
67+
68+
// Another scenario that io.Reader fails
69+
r = testReader(testlib.NewMockReadCloser("read error", nil), nil)
70+
assert.Equal(t, 1, r.line)
71+
line, err = r.readLine()
72+
assert.Error(t, err)
73+
assert.Equal(t, "read error", err.Error())
74+
assert.Nil(t, line)
75+
// reading error (unless it's EOF) bumps current line number
76+
assert.Equal(t, 2, r.line)
77+
}
78+
79+
func TestReadByRowsEnvelope_ByRowsDefault(t *testing.T) {
80+
// default by_rows = 1
81+
r := testReader(strings.NewReader("abc\n\nefghijklmn\n \nxyz\n"),
82+
&fileDecl{Envelopes: []*envelopeDecl{{
83+
Name: strs.StrPtr("env1"),
84+
Columns: []*columnDecl{
85+
{
86+
Name: "col1",
87+
StartPos: 2,
88+
Length: 4,
89+
},
90+
},
91+
}}})
92+
93+
n, err := r.readByRowsEnvelope()
94+
assert.NoError(t, err)
95+
assert.Equal(t, `{"col1":"bc"}`, idr.JSONify2(n))
96+
assert.Equal(t, 2, r.line)
97+
98+
n, err = r.readByRowsEnvelope()
99+
assert.NoError(t, err)
100+
assert.Equal(t, `{"col1":"fghi"}`, idr.JSONify2(n))
101+
assert.Equal(t, 4, r.line)
102+
103+
n, err = r.readByRowsEnvelope()
104+
assert.NoError(t, err)
105+
assert.Equal(t, `{"col1":" "}`, idr.JSONify2(n))
106+
assert.Equal(t, 5, r.line)
107+
108+
n, err = r.readByRowsEnvelope()
109+
assert.NoError(t, err)
110+
assert.Equal(t, `{"col1":"yz"}`, idr.JSONify2(n))
111+
assert.Equal(t, 6, r.line)
112+
113+
n, err = r.readByRowsEnvelope()
114+
assert.Equal(t, io.EOF, err)
115+
assert.Nil(t, n)
116+
}
117+
118+
func TestReadByRowsEnvelope_ByRowsNonDefault(t *testing.T) {
119+
r := testReader(strings.NewReader("abcdefg\n\nhijklmn\n \nabc012345\n"),
120+
&fileDecl{Envelopes: []*envelopeDecl{{
121+
Name: strs.StrPtr("env1"),
122+
ByRows: testlib.IntPtr(3),
123+
Columns: []*columnDecl{
124+
{Name: "col1", StartPos: 2, Length: 4, LinePattern: strs.StrPtr("^abc")},
125+
{Name: "col2", StartPos: 2, Length: 4, LinePattern: strs.StrPtr("^hij")},
126+
{Name: "col3", StartPos: 3, Length: 5, LinePattern: strs.StrPtr("^abc")},
127+
},
128+
}}})
129+
130+
n, err := r.readByRowsEnvelope()
131+
assert.NoError(t, err)
132+
assert.Equal(t, `{"col1":"bcde","col2":"ijkl","col3":"cdefg"}`, idr.JSONify2(n))
133+
134+
n, err = r.readByRowsEnvelope()
135+
assert.Error(t, err)
136+
assert.Equal(t, "input 'test' line 6: incomplete envelope, missing 2 row(s)", err.Error())
137+
assert.Nil(t, n)
138+
}
139+
140+
var (
141+
benchReadByRowsEnvelopeInput = strings.Repeat(
142+
"abcdefghijklmnopqrstuvwxyz\n \n012345678901234567890123456789\n", 1000)
143+
benchReadByRowsEnvelopeDecl = &fileDecl{
144+
Envelopes: []*envelopeDecl{
145+
{
146+
Name: strs.StrPtr("env1"),
147+
ByRows: testlib.IntPtr(3),
148+
Columns: []*columnDecl{
149+
{Name: "col1", StartPos: 2, Length: 10, LinePattern: strs.StrPtr("^abc")},
150+
{Name: "col2", StartPos: 2, Length: 10, LinePattern: strs.StrPtr("^0123")},
151+
{Name: "col3", StartPos: 12, Length: 19, LinePattern: strs.StrPtr("^abc")},
152+
},
153+
},
154+
},
155+
}
156+
)
157+
158+
// BenchmarkReadByRowsEnvelope-8 624 1891740 ns/op 133140 B/op 9005 allocs/op
159+
func BenchmarkReadByRowsEnvelope(b *testing.B) {
160+
for i := 0; i < b.N; i++ {
161+
r := testReader(strings.NewReader(benchReadByRowsEnvelopeInput), benchReadByRowsEnvelopeDecl)
162+
for {
163+
n, err := r.readByRowsEnvelope()
164+
if err != nil {
165+
if err == io.EOF {
166+
break
167+
}
168+
b.FailNow()
169+
}
170+
idr.RemoveAndReleaseTree(n)
171+
}
172+
}
173+
}

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ require (
1111
github.com/go-chi/chi v4.1.2+incompatible
1212
github.com/go-sourcemap/sourcemap v2.1.3+incompatible // indirect
1313
github.com/google/uuid v1.1.2
14-
github.com/jf-tech/go-corelib v0.0.11
14+
github.com/jf-tech/go-corelib v0.0.13
1515
github.com/spf13/cobra v1.0.0
1616
github.com/spf13/pflag v1.0.5 // indirect
1717
github.com/stretchr/testify v1.6.1

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uG
6262
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
6363
github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM=
6464
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
65-
github.com/jf-tech/go-corelib v0.0.11 h1:p5bBM1vf9O2Q//f9q2UyfpWdFUG93X/tgsmKFj/klaU=
66-
github.com/jf-tech/go-corelib v0.0.11/go.mod h1:0+Fejzd53JtexKE5VI8I06WiBNATLIURRJgPrv4Yysg=
65+
github.com/jf-tech/go-corelib v0.0.13 h1:4aDwS09bdfRb/loU6Va+kvqwNJq5irR5WjKVpzJxdag=
66+
github.com/jf-tech/go-corelib v0.0.13/go.mod h1:0+Fejzd53JtexKE5VI8I06WiBNATLIURRJgPrv4Yysg=
6767
github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
6868
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
6969
github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=

0 commit comments

Comments
 (0)