-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpptx.go
More file actions
303 lines (253 loc) · 6.92 KB
/
pptx.go
File metadata and controls
303 lines (253 loc) · 6.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
package docreader
import (
"archive/zip"
"encoding/xml"
"fmt"
"io"
"path/filepath"
"strings"
)
// PptxReader reads .pptx files. It is an empty struct and carries no state.
type PptxReader struct{}
// Slide describes the subset of a slide's XML that this package decodes: the
// text runs found along the element path sld > cSld > spTree > sp > txBody >
// p > r > t. Elements outside that path are not represented here, so only
// "sp" elements that are direct children of "spTree" contribute text.
type Slide struct {
// XMLName names the root element ("sld") the decoder expects.
XMLName xml.Name `xml:"sld"`
// CommonSld maps the "cSld" element.
CommonSld struct {
// ShapeTree maps the "spTree" element.
ShapeTree struct {
// Shapes collects every "sp" child of the shape tree.
Shapes []struct {
// TextBody maps the shape's "txBody" element.
TextBody struct {
// Paragraphs collects the "p" children of the text body.
Paragraphs []struct {
// Runs collects the "r" children of the paragraph.
Runs []struct {
// Text is the character data of the run's "t" element.
Text string `xml:"t"`
} `xml:"r"`
} `xml:"p"`
} `xml:"txBody"`
} `xml:"sp"`
} `xml:"spTree"`
} `xml:"cSld"`
}
// PresentationProps holds the presentation properties decoded from a
// "coreProperties" root element. GetMetadata fills it from the archive entry
// docProps/core.xml. Each field is the text of the child element named in its
// tag; Created and Modified are kept as the raw strings found in the XML.
type PresentationProps struct {
// XMLName names the root element ("coreProperties") the decoder expects.
XMLName xml.Name `xml:"coreProperties"`
Title string `xml:"title"`
Subject string `xml:"subject"`
Creator string `xml:"creator"`
Keywords string `xml:"keywords"`
Created string `xml:"created"`
Modified string `xml:"modified"`
}
// ReadText returns the text of every slide in the PPTX file at filePath as a
// single string.
//
// Each slide that decodes successfully contributes a "=== 幻灯片 N ===" header,
// where N counts decoded slides starting at 1, followed by one line per
// paragraph (the paragraph's runs concatenated). Slide entries that cannot be
// opened, read or decoded are skipped silently. Slides appear in the order
// their entries are listed in the archive.
//
// The error is WrapError(..., ErrFileOpen) when the archive cannot be opened
// and WrapError(..., ErrEmptyFile) when no slide was decoded.
func (r *PptxReader) ReadText(filePath string) (string, error) {
	archive, err := zip.OpenReader(filePath)
	if err != nil {
		return "", WrapError("PptxReader.ReadText", filePath, ErrFileOpen)
	}
	defer archive.Close()

	var out strings.Builder
	decoded := 0 // number of slides written so far

	for _, entry := range archive.File {
		// Only slide parts are of interest; everything else is ignored.
		if !strings.HasPrefix(entry.Name, "ppt/slides/slide") || !strings.HasSuffix(entry.Name, ".xml") {
			continue
		}

		src, err := entry.Open()
		if err != nil {
			continue
		}
		raw, err := io.ReadAll(src)
		src.Close()
		if err != nil {
			continue
		}

		var doc Slide
		if xml.Unmarshal(raw, &doc) != nil {
			continue
		}

		decoded++
		fmt.Fprintf(&out, "\n=== 幻灯片 %d ===\n\n", decoded)
		for _, shape := range doc.CommonSld.ShapeTree.Shapes {
			for _, paragraph := range shape.TextBody.Paragraphs {
				for _, run := range paragraph.Runs {
					out.WriteString(run.Text)
				}
				out.WriteByte('\n')
			}
		}
	}

	if decoded == 0 {
		return "", WrapError("PptxReader.ReadText", filePath, ErrEmptyFile)
	}
	return out.String(), nil
}
// GetMetadata returns metadata for the PPTX file at filePath.
//
// When the archive contains docProps/core.xml and it decodes into
// PresentationProps, the map receives the keys "title", "subject", "creator",
// "keywords", "created" and "modified". Failures while opening, reading or
// decoding that entry are ignored and those keys are simply absent. The key
// "slide_count" is always set to the number of entries whose name matches
// "ppt/slides/slide*.xml".
//
// The error is WrapError(..., ErrFileOpen) when the archive cannot be opened.
func (r *PptxReader) GetMetadata(filePath string) (map[string]string, error) {
	archive, err := zip.OpenReader(filePath)
	if err != nil {
		return nil, WrapError("PptxReader.GetMetadata", filePath, ErrFileOpen)
	}
	defer archive.Close()

	info := make(map[string]string)

	// Core properties: stop at the first core.xml entry that could be read,
	// whether or not it decoded.
	for _, entry := range archive.File {
		if entry.Name != "docProps/core.xml" {
			continue
		}
		src, err := entry.Open()
		if err != nil {
			continue
		}
		raw, err := io.ReadAll(src)
		src.Close()
		if err != nil {
			continue
		}

		var core PresentationProps
		if xml.Unmarshal(raw, &core) == nil {
			info["title"] = core.Title
			info["subject"] = core.Subject
			info["creator"] = core.Creator
			info["keywords"] = core.Keywords
			info["created"] = core.Created
			info["modified"] = core.Modified
		}
		break
	}

	// Slide count. The pattern is a constant, so the error returned by Match
	// is deliberately ignored.
	// NOTE(review): archive entry names use "/" on every platform, whereas
	// filepath.Match follows the host OS separator rules; confirm this is
	// acceptable on Windows.
	count := 0
	for _, entry := range archive.File {
		isSlide, _ := filepath.Match("ppt/slides/slide*.xml", entry.Name)
		if isSlide {
			count++
		}
	}
	info["slide_count"] = fmt.Sprint(count)

	return info, nil
}
// GetSlides returns the text of the PPTX file at filePath grouped by slide:
// one string per decoded slide, in the order the slide entries are listed in
// the archive.
//
// Within a slide every paragraph becomes one line (its runs concatenated,
// terminated by "\n"). Slide entries that cannot be opened, read or decoded
// are skipped silently. When no slide is decoded the returned slice is nil
// and the error is nil.
//
// The error is WrapError(..., ErrFileOpen) when the archive cannot be opened.
func (r *PptxReader) GetSlides(filePath string) ([]string, error) {
	archive, err := zip.OpenReader(filePath)
	if err != nil {
		return nil, WrapError("PptxReader.GetSlides", filePath, ErrFileOpen)
	}
	defer archive.Close()

	var texts []string
	for _, entry := range archive.File {
		isSlide := strings.HasPrefix(entry.Name, "ppt/slides/slide") && strings.HasSuffix(entry.Name, ".xml")
		if !isSlide {
			continue
		}

		src, err := entry.Open()
		if err != nil {
			continue
		}
		raw, err := io.ReadAll(src)
		src.Close()
		if err != nil {
			continue
		}

		var doc Slide
		if xml.Unmarshal(raw, &doc) != nil {
			continue
		}

		var text strings.Builder
		for _, shape := range doc.CommonSld.ShapeTree.Shapes {
			for _, paragraph := range shape.TextBody.Paragraphs {
				for _, run := range paragraph.Runs {
					text.WriteString(run.Text)
				}
				text.WriteByte('\n')
			}
		}
		texts = append(texts, text.String())
	}

	return texts, nil
}
// ReadWithConfig reads the PPTX file at filePath and returns a structured
// result restricted to the slides and lines selected by config.
//
// All slide entries are decoded first; entries that cannot be opened, read or
// decoded are skipped silently, and empty paragraphs are dropped. Slides are
// indexed from 0 in the order their entries are listed in the archive.
// buildPageLineMap(config, totalSlides) then decides which slide indexes are
// included: an index is read only if it is present in the returned map, and
// the associated value is handed to filterLinesForPage to select that slide's
// lines.
//
// In the result:
//   - TotalPages is the number of decoded slides, selected or not.
//   - Pages holds one PageContent per selected slide; PageNumber is the
//     zero-based slide index.
//   - Content concatenates the selected slides, each introduced by a
//     "=== 幻灯片 N ===" header where N is that same zero-based index.
//   - TotalLines is the number of lines across all selected slides.
//   - Metadata is whatever GetMetadata returns; if that call fails it stays
//     an empty, non-nil map.
//
// The error is WrapError(..., ErrFileOpen) when the archive cannot be opened.
func (r *PptxReader) ReadWithConfig(filePath string, config *ReadConfig) (*DocumentResult, error) {
	zipReader, err := zip.OpenReader(filePath)
	if err != nil {
		return nil, WrapError("PptxReader.ReadWithConfig", filePath, ErrFileOpen)
	}
	defer zipReader.Close()

	// slideLines[i] holds the non-empty paragraph lines of the i-th decoded slide.
	var slideLines [][]string
	for _, file := range zipReader.File {
		if !strings.HasPrefix(file.Name, "ppt/slides/slide") || !strings.HasSuffix(file.Name, ".xml") {
			continue
		}

		rc, err := file.Open()
		if err != nil {
			continue
		}
		slideXML, err := io.ReadAll(rc)
		rc.Close()
		if err != nil {
			continue
		}

		var slide Slide
		if err := xml.Unmarshal(slideXML, &slide); err != nil {
			continue
		}

		// Kept non-nil even when empty so filterLinesForPage always
		// receives an initialised slice.
		lines := make([]string, 0)
		for _, shape := range slide.CommonSld.ShapeTree.Shapes {
			for _, para := range shape.TextBody.Paragraphs {
				var lineBuilder strings.Builder
				for _, run := range para.Runs {
					lineBuilder.WriteString(run.Text)
				}
				if line := lineBuilder.String(); line != "" {
					lines = append(lines, line)
				}
			}
		}
		slideLines = append(slideLines, lines)
	}

	totalSlides := len(slideLines)
	result := &DocumentResult{
		FilePath:   filePath,
		TotalPages: totalSlides,
		Pages:      make([]PageContent, 0),
		Metadata:   make(map[string]string),
	}

	// Metadata is best-effort: GetMetadata reopens the file, and if that
	// fails the initialised empty map is kept rather than replaced by nil.
	if metadata, err := r.GetMetadata(filePath); err == nil {
		result.Metadata = metadata
	}

	// Work out which slides to include and the line selection for each.
	pageLineMap := buildPageLineMap(config, totalSlides)

	var contentBuilder strings.Builder
	totalLines := 0
	for slideIndex := 0; slideIndex < totalSlides; slideIndex++ {
		lineConfig, shouldRead := pageLineMap[slideIndex]
		if !shouldRead {
			continue
		}

		// Apply this slide's line selection.
		filteredLines := filterLinesForPage(slideLines[slideIndex], lineConfig)
		result.Pages = append(result.Pages, PageContent{
			PageNumber: slideIndex,
			Lines:      filteredLines,
			TotalLines: len(filteredLines),
		})
		totalLines += len(filteredLines)

		// Append the slide to the flat Content string.
		fmt.Fprintf(&contentBuilder, "\n=== 幻灯片 %d ===\n\n", slideIndex)
		for _, line := range filteredLines {
			contentBuilder.WriteString(line)
			contentBuilder.WriteString("\n")
		}
	}

	result.TotalLines = totalLines
	result.Content = contentBuilder.String()
	return result, nil
}