Skip to content

调用转换方法将PDF文件转换为HTML内容报错Stack overflow #6

@wtujvk

Description

@wtujvk

1、环境:使用 .NET 8 的跨平台 Web 应用;使用的组件为 Spire.PDF(具体版本未注明)。
2、windows下使用iis部署,转换PDF文件为HTML字符串是成功的。
linux下,有的PDF文件导入是成功的,有的不行。
同一个PDF文件,在linux下(Rocky10)转换报错:
Stack overflow.
at System.String.Intern()
at spr毗.⨫(System.Collections.ArrayList, System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
......(中间省略很多相同的内容)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.⻖(System.String)
at spr팷.⨫(spr毗, System.String, Boolean)
at spr팷.⨫(System.String, spr毗, System.String, Boolean)
at spr팷.⨫(spr℺, System.String)
at spr팷.⳸(spr℺)
at spr팷.⨫(Boolean)
at spr팷.㫹()
at spr㚳.⨫(Int32)
at spr㞢.⨫(Spire.Pdf.PdfDocumentBase, System.String, Boolean, Int32, Int32)
at spr㗄.⨫(Spire.Pdf.PdfDocumentBase, System.IO.Stream, Int32, Int32)
at Spire.Pdf.PdfDocumentBase.ⷧ(Int32, Int32, System.IO.Stream)
at Spire.Pdf.PdfDocumentBase.SaveToHtml(System.IO.Stream)
at Spire.Pdf.PdfDocumentBase.Save(System.IO.Stream, Spire.Pdf.FileFormat)
at Spire.Pdf.PdfDocument.SaveToStream(System.IO.Stream, Spire.Pdf.FileFormat)
at PowerEasy.Modules.ContentManage.Business.SpirePdfImportWordLogic.ConvertToHtml(PowerEasy.Foundation.UploadBackground.PowerHttpFile, Boolean)
at PowerEasy.Modules.ContentManage.Business.WordImportLogicWrapper.ConvertToHtml(PowerEasy.Foundation.UploadBackground.PowerHttpFile, Boolean)

还有一处错误(另一个版本8.x或9.x?):
Stack overflow.
at System.SpanHelpers.IndexOf(Char ByRef, Int32, Char ByRef, Int32)
at System.String.Replace(System.String, System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
......(中间省略很多相同的内容)

应用程序中调用的方法代码(C#):
`SpirePdfImportWordLogic.cs类:
// Extensions passed to the byte-sniffing helper as the candidate image types.
private static readonly string[] pictureExtensions = new string[] { ".jpg", ".jpeg", ".bmp", ".png", ".gif", ".tiff", ".svg" };

// Matches a <p> element whose class attribute marks it as a heading
// ("heading-N" / "标题-N", or a bare "标题" / "Title" / "Heading").
// NOTE(review): the pasted pattern had lost its `*` quantifiers and the closing
// `</p>` (it spanned blank lines instead), which looks like markdown rendering
// damage. It is reconstructed here as `.*>?.*</p>` — confirm against the real file.
private static readonly Regex titlePattern = new Regex(@"<p\sclass=""((heading|标题)-+\d|(标题|Title|Heading))""?.*>?.*</p>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

// Extracts the "heading-N" / "标题-N" token from a matched heading paragraph.
private static readonly Regex titlePattern2 = new Regex(@"(heading|标题)-+\d", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Matches each base64 data URI, e.g. data:application/x-font-woff;base64,d09GRgABAAAAAC9oAA0A==
    // The media-type part is matched lazily and may not contain a comma, quote or
    // whitespace, so a match cannot run from one data URI into a later one on the
    // same line (the previous greedy `.*` did, yielding a payload that is not base64).
    private static readonly Regex base64Pattern = new Regex(@"data:\w+/?[^,""'\s]*?;base64,(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?", RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Converts an uploaded PDF to an HTML fragment: renders the document to HTML with
/// Spire.PDF, uploads each embedded base64 payload and substitutes its stored path,
/// keeps only the body element, maps heading paragraphs to h-tags and strips
/// class/style/SVG markup.
/// </summary>
/// <param name="file">Uploaded PDF; its stream is read and then closed here.</param>
/// <param name="removeImage">Not read by this method.</param>
/// <returns>The cleaned HTML, or the post-processed empty string when the conversion produced no content.</returns>
public string ConvertToHtml(PowerHttpFile file, bool removeImage = false)
{
    using (var doc = new PdfDocument())
    using (var memoryStream = new MemoryStream())
    {
        string htmlResult = string.Empty;

        doc.LoadFromStream(file.Stream);
        file.Stream.Close();

        doc.ConvertOptions.PdfToHtmlHorizontalAlignmentCenter = true;

        // NOTE(review): the positional arguments are presumably (useEmbeddedSvg, useEmbeddedImg,
        // maxPageOneFile, useHighQualityEmbeddedSvg) — confirm against the Spire.PDF API reference.
        doc.ConvertOptions.SetPdfToHtmlOptions(true, true, 1, false);
        doc.SaveToStream(memoryStream, FileFormat.HTML);
        memoryStream.Seek(0, SeekOrigin.Begin);
        using var streamReader = new StreamReader(memoryStream);
        htmlResult = streamReader.ReadToEnd();

        if (!string.IsNullOrWhiteSpace(htmlResult))
        {
            var sb = new StringBuilder(htmlResult);

            // StringBuilder.Replace rewrites every occurrence at once, so each distinct
            // data URI needs to be uploaded only once.
            var dataUris = base64Pattern.Matches(htmlResult)
                .Cast<Match>()
                .Select(m => m.Value)
                .Distinct();

            foreach (var dataUri in dataUris)
            {
                // A data URI looks like: data:application/x-font-woff;base64,d09GRgABAAAAAC9oAA0A==
                // This method is synchronous, so the async upload is awaited by blocking.
                var replaceInfo = this.SaveBase64Data(dataUri).ConfigureAwait(false).GetAwaiter().GetResult();
                if (replaceInfo == null)
                {
                    continue;
                }

                if (replaceInfo.Succeed)
                {
                    sb.Replace(dataUri, replaceInfo.ImagePath);
                }
            }

            htmlResult = sb.ToString();

            // Keep only the <body ...> ... </body> element of the generated document.
            const string bodyCloseTag = "</body>";
            var firstIndex = htmlResult.IndexOf("<body", StringComparison.Ordinal);
            var lastIndex = htmlResult.LastIndexOf(bodyCloseTag, StringComparison.Ordinal);
            if (firstIndex > -1 && lastIndex > firstIndex)
            {
                // Include the complete closing tag (the previous "+ 6" dropped its '>').
                htmlResult = htmlResult.Substring(firstIndex, lastIndex - firstIndex + bodyCloseTag.Length);
            }

            htmlResult = this.ParseDocTitleToHtmlTitle(htmlResult);

            // Strip class attributes and the style/SVG markup.
            htmlResult = this.ClearDocContent(htmlResult);
        }

        return this.HandleEnterKey(htmlResult);
    }
}

    /// <summary>
    /// Decodes one base64 data URI, determines its image type from the decoded bytes
    /// and uploads it through the upload pipeline.
    /// </summary>
    /// <param name="base64Data">
    /// A data URI such as <c>data:application/x-font-woff;base64,d09GRgABAAAAAC9oAA0A==</c>.
    /// </param>
    /// <returns>
    /// <c>null</c> when the input is blank, has no ";base64," separator, or is a woff font;
    /// otherwise a <see cref="ReplaceImageInfo"/> whose <c>Succeed</c> flag is set only
    /// when the upload reported no error.
    /// </returns>
    private async Task<ReplaceImageInfo> SaveBase64Data(string base64Data)
    {
        var splitKey = ";base64,";

        if (string.IsNullOrWhiteSpace(base64Data) || !base64Data.Contains(splitKey))
        {
            return null;
        }

        var base64Info = base64Data.Split(splitKey);

        // Fonts are skipped.
        if (base64Info[0].Contains("x-font-woff"))
        {
            return null;
        }

        byte[] bytes = Convert.FromBase64String(base64Info[1]);

        // Placeholder describing what should replace the data URI.
        var replaceInfo = new ReplaceImageInfo();

        // Determine the file type from the decoded bytes.
        var extension = UploadFileContentTypeSecurityHelper.GetExtensionByBytes(bytes, pictureExtensions).FirstOrDefault();
        if (string.IsNullOrWhiteSpace(extension))
        {
            replaceInfo.Alt = "不支持的文件格式";
            replaceInfo.FileName = "不支持的文件格式";
            return replaceInfo;
        }

        // The content type is looked up with the extension as returned by the helper,
        // before its dot is stripped below.
        var contentType = UploadFileContentTypeSecurityHelper.FileContentTypes.ContainsKey(extension) ?
            UploadFileContentTypeSecurityHelper.FileContentTypes[extension].FirstOrDefault() :
            base64Info[0].Replace("data:", string.Empty);

        // The extension is known to be non-blank here; drop the dot because the file
        // name template below supplies its own.
        extension = extension.Replace(".", string.Empty);

        using Stream stream = new MemoryStream(bytes);
        var powerHttpFile = new PowerHttpFile
        {
            Stream = stream,
            ContentType = contentType,
            FileName = $"{Guid.NewGuid().ToString("n").Substring(0, 20)}.{extension}",
            ContentLength = Convert.ToInt32(stream.Length)
        };
        var uploadImageLogic = this.GetUploadFileLogic(powerHttpFile);
        using var cts = new CancellationTokenSource();
        var uploadFileResult = await uploadImageLogic.UploadFileAsync(powerHttpFile, cts.Token);

        if (!uploadFileResult.IsError)
        {
            replaceInfo.Succeed = true;

            // Display path: the "$" placeholder in the relative path is replaced by "/" + upload directory.
            var imagePath = uploadFileResult.RelativePath.Replace("$", "/" + GlobalUploadConfig.Instance.UploadDirectory);
            replaceInfo.ImagePath = imagePath;
            replaceInfo.FileName = uploadFileResult.FileName;

            if (!string.IsNullOrEmpty(uploadFileResult.OriginalImagePath))
            {
                replaceInfo.OriginalImagePath = uploadFileResult.OriginalImagePath.Replace("$", "/" + GlobalUploadConfig.Instance.UploadDirectory);
            }
        }
        else
        {
            replaceInfo.ImagePath = uploadFileResult.RelativePath;
        }

        return replaceInfo;
    }

    /// <summary>
    /// Replaces a single space sitting between two tags with <c>&amp;nbsp;</c> and removes
    /// every <c>margin-left: 0;</c> declaration.
    /// </summary>
    /// <param name="htmlResult">The HTML document.</param>
    /// <returns>The adjusted HTML document.</returns>
    private string HandleEnterKey(string htmlResult)
    {
        const string spaceBetweenTags = "> <";
        const string zeroLeftMargin = "margin-left: 0;";

        // Per the original author's note, <p><span>&nbsp;</span></p> shows up as a line break in the editor.
        var withNbsp = Regex.Replace(htmlResult, spaceBetweenTags, ">&nbsp;<");

        // Per the original author's note, this works around centred tables that could not be repositioned in the editor.
        var withoutZeroMargin = Regex.Replace(withNbsp, zeroLeftMargin, string.Empty);

        return withoutZeroMargin;
    }

    /// <summary>
    /// Strips presentation markup from the converted HTML: removes the class attribute
    /// from every element, deletes style/defs/path elements, replaces each svg element
    /// with a div holding the same inner HTML, and moves an img element's
    /// <c>xlink:href</c> value into <c>src</c>.
    /// </summary>
    /// <param name="html">The HTML content.</param>
    /// <returns>The cleaned HTML; blank input is returned unchanged.</returns>
    private string ClearDocContent(string html)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return html;
        }

        var doc = NSoup.NSoupClient.Parse(html);
        foreach (var item in doc.Children)
        {
            StripClassAttributes(item);
        }

        RemoveElementsByTag("style");

        // RemoveElementsByTag("g");
        RemoveElementsByTag("defs");
        RemoveElementsByTag("path");

        // Move the content of each svg into a div placed before it, then drop the svg.
        try
        {
            foreach (var svg in doc.GetElementsByTag("svg"))
            {
                var innerHtml = svg.Html();
                var replacement = new Element(NSoup.Parse.Tag.ValueOf("div"), svg.BaseUri);
                replacement.Html(innerHtml);
                svg.Before(replacement);
                svg.Remove();
            }

            // Turn the embedded image link into a regular src attribute.
            foreach (var element in doc.GetElementsByTag("img"))
            {
                var href = element.Attr("xlink:href");
                if (!string.IsNullOrWhiteSpace(href))
                {
                    element.RemoveAttr("xlink:href");
                    element.Attr("src", href);
                }
            }
        }
        catch (Exception)
        {
            // Deliberately best-effort: a failure here leaves the remaining markup as it is.
        }

        // Remove any svg elements that are still present.
        RemoveElementsByTag("svg");

        return doc.Html();

        // Removes every element with the given tag name; failures are ignored.
        void RemoveElementsByTag(string tagName)
        {
            try
            {
                var elements = doc.GetElementsByTag(tagName);
                if (elements.Any())
                {
                    elements.Remove();
                }
            }
            catch (Exception)
            {
                // Deliberately best-effort.
            }
        }

        // Removes the class attribute from the element and, recursively, from its descendants.
        void StripClassAttributes(Element element)
        {
            if (element == null)
            {
                return;
            }

            if (!string.IsNullOrWhiteSpace(element.ClassName()))
            {
                element.RemoveAttr("class");
            }

            foreach (var child in element.Children)
            {
                StripClassAttributes(child);
            }
        }
    }

    /// <summary>
    /// Rewrites paragraphs styled as "Heading-N" / "标题-N" into <c>hN</c> elements;
    /// heading paragraphs without a level become <c>h1</c>.
    /// </summary>
    /// <param name="html">The HTML document.</param>
    /// <returns>The converted HTML document; blank input is returned unchanged.</returns>
    private string ParseDocTitleToHtmlTitle(string html)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return html;
        }

        var titles = titlePattern.Matches(html);
        if (titles.Count == 0)
        {
            return html;
        }

        var htmlResultBuilder = new StringBuilder(html);
        foreach (Match item in titles)
        {
            // The level is the digit that ends the "heading-N" token. Taking the last
            // character (rather than splitting on '-') also handles tokens with several
            // dashes such as "heading--2", which the level pattern accepts.
            // Any digit is passed through as-is, so levels outside 1-6 are not corrected here.
            var titleLevel = titlePattern2.Match(item.Value);
            var htmlTitle = titleLevel.Success ? "h" + titleLevel.Value[^1] : "h1";

            var titleHtml = item.Value
                .Replace("<p", $"<{htmlTitle}")
                .Replace("</p>", $"</{htmlTitle}>")
                .Replace("&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;", string.Empty);

            htmlResultBuilder.Replace(item.Value, titleHtml);
        }

        return htmlResultBuilder.ToString();
    }

    /// <summary>
    /// Builds the upload logic for one file, backed by a local upload processor created for that file.
    /// </summary>
    /// <param name="file">The file about to be uploaded.</param>
    /// <returns>A new <see cref="UploadFileLogic"/> wired with this instance's collaborators.</returns>
    private UploadFileLogic GetUploadFileLogic(PowerHttpFile file)
    {
        var processor = new LocalUploadFileProcessor(
            file,
            this.uploadFileContext,
            this.watermarkProvider,
            this.imageUtilitiesBuilderFactory,
            this.watermarkLogic);

        return new UploadFileLogic(
            this.uploadFileContext,
            processor,
            this.beforeUploadProvider,
            this.afterUploadProvider,
            this.uploadFileRecordHelper,
            this.uploadFileLogicLogger);
    }

    /// <summary>
    /// Outcome of storing one embedded payload: where it ended up and whether the upload succeeded.
    /// </summary>
    private class ReplaceImageInfo
    {
        public string Key { get; set; }

        /// <summary>Alternative text for the placeholder.</summary>
        public string Alt { get; set; }

        /// <summary>Name of the stored file.</summary>
        public string FileName { get; set; }

        /// <summary>Path that replaces the data URI in the HTML.</summary>
        public string ImagePath { get; set; }

        public string OriginalImagePath { get; set; }

        /// <summary>True when the upload reported no error.</summary>
        public bool Succeed { get; set; }

        /// <summary>
        /// True when the upload succeeded and <see cref="ImagePath"/> ends with one of the
        /// known picture extensions (case-insensitive).
        /// </summary>
        public bool IsImage
        {
            get
            {
                if (!this.Succeed || string.IsNullOrWhiteSpace(this.ImagePath))
                {
                    return false;
                }

                foreach (var ext in pictureExtensions)
                {
                    if (this.ImagePath.EndsWith(ext, StringComparison.OrdinalIgnoreCase))
                    {
                        return true;
                    }
                }

                return false;
            }
        }
    }`

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions