@@ -14,9 +14,10 @@ import (
1414 "github.com/roboco-io/hwp2markdown/internal/llm/gemini"
1515 "github.com/roboco-io/hwp2markdown/internal/llm/ollama"
1616 "github.com/roboco-io/hwp2markdown/internal/llm/openai"
17- "github.com/roboco-io/hwp2markdown/internal/llm/upstage"
17+ llmupstage "github.com/roboco-io/hwp2markdown/internal/llm/upstage"
1818 "github.com/roboco-io/hwp2markdown/internal/parser"
1919 "github.com/roboco-io/hwp2markdown/internal/parser/hwpx"
20+ parserupstage "github.com/roboco-io/hwp2markdown/internal/parser/upstage"
2021 "github.com/spf13/cobra"
2122)
2223
2627 convertProvider string
2728 convertModel string
2829 convertBaseURL string
30+ convertParser string
2931 convertExtractImgs bool
3032 convertImagesDir string
3133 convertVerbose bool
@@ -42,6 +44,7 @@ var convertCmd = &cobra.Command{
4244더 자연스러운 Markdown을 생성할 수 있습니다.
4345
4446환경 변수:
47+ HWP2MD_PARSER=xxx 파서 선택 (native, upstage)
4548 HWP2MD_LLM=true Stage 2 활성화
4649 HWP2MD_MODEL=xxx 모델 이름 (프로바이더 자동 감지)
4750 HWP2MD_BASE_URL=xxx 프라이빗 API 엔드포인트 (Bedrock, 로컬 서버 등)
@@ -58,9 +61,14 @@ var convertCmd = &cobra.Command{
5861 --base-url http://localhost:8080 # 로컬 서버
5962 --base-url https://your-azure-endpoint.openai.azure.com # Azure OpenAI
6063
64+ 파서 선택:
65+ --parser=native 내장 파서 사용 (기본)
66+ --parser=upstage Upstage Document Parse API 사용 (UPSTAGE_API_KEY 필요)
67+
6168예시:
6269 hwp2markdown convert document.hwpx
6370 hwp2markdown convert document.hwpx -o output.md
71+ hwp2markdown convert document.hwpx --parser upstage
6472 hwp2markdown convert document.hwpx --llm
6573 hwp2markdown convert document.hwpx --llm --model gpt-4o
6674 hwp2markdown convert document.hwpx --llm --model solar-pro
@@ -76,6 +84,7 @@ func init() {
7684 convertCmd .Flags ().StringVar (& convertProvider , "provider" , "" , "LLM 프로바이더 (openai, anthropic, gemini, upstage, ollama)" )
7785 convertCmd .Flags ().StringVar (& convertModel , "model" , "" , "LLM 모델 이름" )
7886 convertCmd .Flags ().StringVar (& convertBaseURL , "base-url" , "" , "프라이빗 API 엔드포인트 (Bedrock, Azure, 로컬 서버 등)" )
87+ convertCmd .Flags ().StringVar (& convertParser , "parser" , "" , "파서 선택 (native, upstage)" )
7988 convertCmd .Flags ().BoolVar (& convertExtractImgs , "extract-images" , false , "이미지 추출 활성화" )
8089 convertCmd .Flags ().StringVar (& convertImagesDir , "images-dir" , "./images" , "추출된 이미지 저장 디렉토리" )
8190 convertCmd .Flags ().BoolVarP (& convertVerbose , "verbose" , "v" , false , "상세 출력" )
@@ -103,8 +112,21 @@ func runConvert(cmd *cobra.Command, args []string) error {
103112 fmt .Fprintf (cmd .ErrOrStderr (), "파일 형식: %s\n " , format )
104113 }
105114
115+ // Determine parser type (from flag or env)
116+ parserType := convertParser
117+ if parserType == "" {
118+ parserType = os .Getenv ("HWP2MD_PARSER" )
119+ }
120+ if parserType == "" {
121+ parserType = "native"
122+ }
123+
124+ if ! convertQuiet && convertVerbose {
125+ fmt .Fprintf (cmd .ErrOrStderr (), "파서: %s\n " , parserType )
126+ }
127+
106128 // Parse document (Stage 1)
107- doc , err := parseDocumentForConvert (inputPath , format )
129+ doc , err := parseDocumentForConvert (cmd , inputPath , format , parserType )
108130 if err != nil {
109131 return fmt .Errorf ("문서 파싱 실패: %w" , err )
110132 }
@@ -153,7 +175,23 @@ func runConvert(cmd *cobra.Command, args []string) error {
153175 return nil
154176}
155177
156- func parseDocumentForConvert (path string , format parser.Format ) (* ir.Document , error ) {
178+ func parseDocumentForConvert (cmd * cobra.Command , path string , format parser.Format , parserType string ) (* ir.Document , error ) {
179+ // Use Upstage Document Parse API if selected
180+ if parserType == "upstage" {
181+ upstageParser , err := parserupstage .New (parserupstage.Config {})
182+ if err != nil {
183+ return nil , fmt .Errorf ("Upstage 파서 초기화 실패: %w" , err )
184+ }
185+
186+ if ! convertQuiet {
187+ fmt .Fprintf (cmd .ErrOrStderr (), "Upstage Document Parse API 사용 중...\n " )
188+ }
189+
190+ ctx := context .Background ()
191+ return upstageParser .Parse (ctx , path )
192+ }
193+
194+ // Native parser
157195 opts := parser.Options {
158196 ExtractImages : convertExtractImgs ,
159197 ImageDir : convertImagesDir ,
@@ -169,7 +207,8 @@ func parseDocumentForConvert(path string, format parser.Format) (*ir.Document, e
169207 return p .Parse ()
170208
171209 case parser .FormatHWP :
172- return nil , fmt .Errorf ("HWP 5.x 형식은 아직 지원하지 않습니다" )
210+ // Native parser doesn't support HWP, suggest using Upstage
211+ return nil , fmt .Errorf ("HWP 5.x 형식은 내장 파서에서 지원하지 않습니다. --parser=upstage 옵션을 사용하세요" )
173212
174213 default :
175214 return nil , fmt .Errorf ("알 수 없는 형식: %s" , format )
@@ -239,7 +278,7 @@ func formatWithLLM(cmd *cobra.Command, doc *ir.Document) (string, *llm.FormatRes
239278 Model : model ,
240279 })
241280 case "upstage" :
242- provider , err = upstage .New (upstage .Config {
281+ provider , err = llmupstage .New (llmupstage .Config {
243282 Model : model ,
244283 BaseURL : baseURL ,
245284 })
@@ -270,6 +309,24 @@ func formatWithLLM(cmd *cobra.Command, doc *ir.Document) (string, *llm.FormatRes
270309}
271310
272311func convertToBasicMarkdown (doc * ir.Document ) string {
312+ // If RawMarkdown is available (e.g., from Upstage parser), use it directly
313+ if doc .RawMarkdown != "" {
314+ var sb strings.Builder
315+ // Add front matter if metadata exists
316+ if doc .Metadata .Title != "" || doc .Metadata .Author != "" {
317+ sb .WriteString ("---\n " )
318+ if doc .Metadata .Title != "" {
319+ sb .WriteString (fmt .Sprintf ("title: %s\n " , doc .Metadata .Title ))
320+ }
321+ if doc .Metadata .Author != "" {
322+ sb .WriteString (fmt .Sprintf ("author: %s\n " , doc .Metadata .Author ))
323+ }
324+ sb .WriteString ("---\n \n " )
325+ }
326+ sb .WriteString (doc .RawMarkdown )
327+ return sb .String ()
328+ }
329+
273330 var sb strings.Builder
274331
275332 // Metadata as YAML front matter (optional)
0 commit comments