77 "fmt"
88 "log"
99 "net/url"
10+ "regexp"
1011 "strings"
1112 stdhtml "html"
1213
@@ -59,6 +60,19 @@ func isNextSiblingBr(node *html.Node) bool {
5960 return false
6061}
6162
63+ func hasNextSignificantSibling (node * html.Node ) bool {
64+ for next := node .NextSibling ; next != nil ; next = next .NextSibling {
65+ if next .Type == html .TextNode {
66+ if len (strings .TrimSpace (next .Data )) == 0 {
67+ continue
68+ }
69+ return true
70+ }
71+ return true // Any element
72+ }
73+ return false
74+ }
75+
6276func ParseNodesForPassage (node * html.Node ) string {
6377 var parts []string
6478
@@ -114,7 +128,7 @@ func ParseNodesForPassage(node *html.Node) string {
114128 if headerText == "Footnotes" || headerText == "Cross references" {
115129 continue
116130 }
117- parts = append (parts , fmt .Sprintf ("\n \n <b>%s</b>\n " , headerText ))
131+ parts = append (parts , fmt .Sprintf ("\n \n <b>%s</b>\n " , strings . TrimSpace ( headerText ) ))
118132 case "ul" , "ol" :
119133 parts = append (parts , ParseNodesForPassage (child ))
120134 case "li" :
@@ -136,11 +150,21 @@ func ParseNodesForPassage(node *html.Node) string {
136150 return strings .Join (parts , "" )
137151}
138152
// newlineRegex matches a run of whitespace containing at least two newlines:
// a newline, optional whitespace, a second newline, and any whitespace that
// follows. \s already includes \n, so no separate newline class is needed.
var newlineRegex = regexp.MustCompile(`\n\s*\n\s*`)

// CleanPassageText normalizes blank lines in passage text.
//
// Every run of two or more newlines (optionally separated or followed by other
// whitespace) is collapsed to exactly one blank line ("\n\n"), and leading and
// trailing whitespace is removed from the result. Single newlines are left
// untouched.
func CleanPassageText(text string) string {
	// The replacement must be exactly two newlines; any extra characters here
	// would leak stray whitespace into the start of the following line.
	text = newlineRegex.ReplaceAllString(text, "\n\n")
	return strings.TrimSpace(text)
}
161+
139162func GetPassage (ref string , doc * html.Node , version string ) string {
140163 // Replaced FilterTree with direct parsing of the root node
141164 // This allows handling arbitrary structure (divs, lists) returned by the API
142165
143166 text := ParseNodesForPassage (doc )
167+ text = CleanPassageText (text )
144168
145169 var passage strings.Builder
146170
@@ -151,7 +175,7 @@ func GetPassage(ref string, doc *html.Node, version string) string {
151175 }
152176
153177 passage .WriteString ("\n " )
154- passage .WriteString (strings . TrimSpace ( text ) )
178+ passage .WriteString (text )
155179
156180 return passage .String ()
157181}
0 commit comments