@@ -35,51 +35,105 @@ func GetReference(doc *html.Node) string {
3535 return utils .GetTextNode (refNode ).Data
3636}
3737
38- func ParseNodesForPassage (node * html.Node ) string {
39- var parts []string
40-
41- for child := node .FirstChild ; child != nil ; child = child .NextSibling {
42- if child .Type == html .TextNode {
43- parts = append (parts , child .Data )
44- } else if child .Type == html .ElementNode {
45- var subParts string
46- switch child .Data {
47- case "sup" :
48- isFootnote := func (node * html.Node ) bool {
49- for _ , attr := range node .Attr {
50- if attr .Key == "class" && attr .Val == "footnote" {
51- return true
52- }
53- }
54- return false
55- }
56- if isFootnote (child ) {
57- continue
58- }
59- childText := ParseNodesForPassage (child )
60- if len (childText ) > 0 {
61- subParts = fmt .Sprintf ("<b>%s</b>" , childText )
62- }
63- case "i" :
64- childText := ParseNodesForPassage (child )
65- subParts = fmt .Sprintf ("<i>%s</i>" , childText )
66- case "p" , "span" , "body" , "html" :
67- subParts = ParseNodesForPassage (child )
68- case "br" :
69- subParts = "\n "
70- default :
71- subParts = ParseNodesForPassage (child )
// markdownV2Escaper prefixes every character that Telegram's MarkdownV2
// parser treats as markup with a backslash.
//
// Per the Telegram Bot API formatting rules the reserved characters are:
// '_', '*', '[', ']', '(', ')', '~', '`', '>', '#', '+', '-', '=', '|',
// '{', '}', '.', '!'. The backslash is the escape character itself, so a
// literal backslash must be doubled as well; otherwise an input such as `\_`
// would come out as `\\_` (escaped backslash followed by a live underscore).
// '^' is not reserved and is left alone.
//
// The replacer is built once at package level: a strings.Replacer is
// immutable and safe for concurrent use, and replacement is a single pass
// over the input, so inserted backslashes are never escaped a second time.
var markdownV2Escaper = strings.NewReplacer(
	`\`, `\\`,
	"_", `\_`, "*", `\*`, "[", `\[`, "]", `\]`, "(", `\(`, ")", `\)`,
	"~", `\~`, "`", "\\`", ">", `\>`, "#", `\#`, "+", `\+`, "-", `\-`,
	"=", `\=`, "|", `\|`, "{", `\{`, "}", `\}`, ".", `\.`, "!", `\!`,
)

// escapeMarkdownV2 returns s with every MarkdownV2-reserved character
// (and every backslash) preceded by a backslash, so that s is rendered as
// plain text. It is meant for raw text content only: applying it to a string
// that already contains formatting markers escapes those markers too.
func escapeMarkdownV2(s string) string {
	return markdownV2Escaper.Replace(s)
}
52+
// Helper functions for parsing.

// isFormattingTag reports whether tag is one of the inline formatting
// element names "sup", "i" or "b". The comparison is case-sensitive.
func isFormattingTag(tag string) bool {
	switch tag {
	case "sup", "i", "b":
		return true
	default:
		return false
	}
}
57+
// isHeaderTag reports whether tag is one of the heading element names
// "h1" through "h4". The comparison is case-sensitive; "h5" and "h6" are
// not treated as headers.
func isHeaderTag(tag string) bool {
	switch tag {
	case "h1", "h2", "h3", "h4":
		return true
	default:
		return false
	}
}
61+
62+ func wrapText (text , tag string ) string {
63+ if strings .TrimSpace (text ) == "" {
64+ return text
65+ }
66+
67+ if tag == "sup" {
68+ // User-specified format for superscript
69+ return fmt .Sprintf ("^%s^" , strings .Trim (text , " " ))
70+ }
71+ if tag == "i" {
72+ return fmt .Sprintf ("_%s_" , text )
73+ }
74+ if tag == "b" || isHeaderTag (tag ) {
75+ return fmt .Sprintf ("*%s*" , text )
76+ }
77+ return text
78+ }
79+
// parseNode renders node and its descendants as a single string.
//
//   - A text node is returned through escapeMarkdownV2.
//   - Any other non-element node yields the concatenation of its children's
//     renderings.
//   - A "br" element yields the newline literal shown below.
//   - An element that is neither a formatting tag (isFormattingTag) nor a
//     header tag (isHeaderTag) yields the concatenation of its children's
//     renderings, with no wrapper of its own.
//   - A "sup" element with a class attribute equal to "footnote" yields "".
//   - Every remaining formatting/header element has its content passed
//     through wrapText with the element's own tag.
//
// NOTE(review): this block is shown in diff form. The lines whose marker ends
// in "-" (the append to parts, strings.Join, the h1–h4 check and "return
// text") reference names this function never declares; they appear to be the
// removed lines of ParseNodesForPassage and not part of parseNode — confirm.
80+ func parseNode (node * html.Node ) string {
81+ if node .Type == html .TextNode {
82+ return escapeMarkdownV2 (node .Data )
83+ }
84+
// Nodes that are neither text nor elements contribute only what their
// children render.
85+ if node .Type != html .ElementNode {
86+ var content strings.Builder
87+ for c := node .FirstChild ; c != nil ; c = c .NextSibling {
88+ content .WriteString (parseNode (c ))
89+ }
90+ return content .String ()
91+ }
92+
93+ tag := node .Data
94+
95+ // Handle non-formatting tags first
96+ if tag == "br" {
// NOTE(review): the literal is displayed with a space after \n; this may be
// an extraction artifact rather than intended output — confirm.
97+ return "\n "
98+ }
99+ if ! isFormattingTag (tag ) && ! isHeaderTag (tag ) {
100+ var content strings.Builder
101+ for c := node .FirstChild ; c != nil ; c = c .NextSibling {
102+ content .WriteString (parseNode (c ))
103+ }
104+ return content .String ()
105+ }
106+
107+ // Handle formatting tags (b, i, sup, h1-h4)
// The class value is compared for exact equality, so a class attribute that
// lists "footnote" alongside other classes is not matched here.
108+ if tag == "sup" {
109+ for _ , attr := range node .Attr {
110+ if attr .Key == "class" && attr .Val == "footnote" {
111+ return "" // Ignore footnote nodes
72112 }
73- parts = append (parts , subParts )
74113 }
75114 }
76115
77- text := strings .Join (parts , "" )
// content collects the final output; textBuffer collects the current run of
// children that will be wrapped together with this node's tag.
116+ var content strings.Builder
117+ var textBuffer strings.Builder
118+
// flushTextBuffer wraps the pending run (if any) via wrapText and appends it
// to content, then starts a new run.
119+ flushTextBuffer := func () {
120+ if textBuffer .Len () > 0 {
121+ content .WriteString (wrapText (textBuffer .String (), tag ))
122+ textBuffer .Reset ()
123+ }
124+ }
78125
79- if node .Data == "h1" || node .Data == "h2" || node .Data == "h3" || node .Data == "h4" {
80- text = fmt .Sprintf ("<b>%s</b>" , text )
// A direct child that is itself a formatting/header element ends the pending
// run and is appended as rendered by its own recursive call, i.e. without
// this node's wrapper around it. Every other child joins the pending run.
126+ for c := node .FirstChild ; c != nil ; c = c .NextSibling {
127+ if c .Type == html .ElementNode && (isFormattingTag (c .Data ) || isHeaderTag (c .Data )) {
128+ flushTextBuffer ()
129+ content .WriteString (parseNode (c ))
130+ } else {
131+ textBuffer .WriteString (parseNode (c ))
132+ }
81133 }
82- return text
134+ flushTextBuffer ()
135+
136+ return content .String ()
83137}
84138
85139func ParsePassageFromHtml (rawHtml string ) string {
@@ -88,8 +142,7 @@ func ParsePassageFromHtml(rawHtml string) string {
88142 log .Printf ("Error parsing html: %v" , err )
89143 return rawHtml
90144 }
91-
92- return ParseNodesForPassage (doc )
145+ return parseNode (doc )
93146}
94147
95148// Deprecated: Using new API service
@@ -119,7 +172,7 @@ func GetPassage(ref string, doc *html.Node, version string) string {
119172 return false
120173 })
121174
122- textBlocks := utils .MapNodeListToString (filtNodes , ParseNodesForPassage )
175+ textBlocks := utils .MapNodeListToString (filtNodes , parseNode )
123176
124177 var passage strings.Builder
125178
@@ -173,6 +226,7 @@ func GetBiblePassage(env def.SessionData) def.SessionData {
173226}
174227
// Deprecated: Using new API service logic inside GetBiblePassage.
176230func CheckBibleReference (ref string ) bool {
177231 log .Printf ("Checking reference %s" , ref )
178232
0 commit comments