1、环境:使用的.Net8,跨平台应用Web网站。使用的组件版本
2、Windows 下使用 IIS 部署,将 PDF 文件转换为 HTML 字符串是成功的。
Linux 下,有的 PDF 文件可以成功导入,有的不行。
同一个 PDF 文件,在 Linux(Rocky 10)下转换时报错:
Stack overflow.
at System.String.Intern()
at spr毗.⨫(System.Collections.ArrayList, System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
......(中间省略很多相同的内容)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.⻖(System.String)
at spr팷.⨫(spr毗, System.String, Boolean)
at spr팷.⨫(System.String, spr毗, System.String, Boolean)
at spr팷.⨫(spr℺, System.String)
at spr팷.(spr℺)
at spr팷.⨫(Boolean)
at spr팷.㫹()
at spr㚳.⨫(Int32)
at spr㞢.⨫(Spire.Pdf.PdfDocumentBase, System.String, Boolean, Int32, Int32)
at spr㗄.⨫(Spire.Pdf.PdfDocumentBase, System.IO.Stream, Int32, Int32)
at Spire.Pdf.PdfDocumentBase.ⷧ(Int32, Int32, System.IO.Stream)
at Spire.Pdf.PdfDocumentBase.SaveToHtml(System.IO.Stream)
at Spire.Pdf.PdfDocumentBase.Save(System.IO.Stream, Spire.Pdf.FileFormat)
at Spire.Pdf.PdfDocument.SaveToStream(System.IO.Stream, Spire.Pdf.FileFormat)
at PowerEasy.Modules.ContentManage.Business.SpirePdfImportWordLogic.ConvertToHtml(PowerEasy.Foundation.UploadBackground.PowerHttpFile, Boolean)
at PowerEasy.Modules.ContentManage.Business.WordImportLogicWrapper.ConvertToHtml(PowerEasy.Foundation.UploadBackground.PowerHttpFile, Boolean)
还有一处错误(另一个版本8.x或9.x?):
Stack overflow.
at System.SpanHelpers.IndexOf(Char ByRef, Int32, Char ByRef, Int32)
at System.String.Replace(System.String, System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
......(中间省略很多相同的内容)
应用程序中调用的方法代码(C#):
`SpirePdfImportWordLogic.cs类:
// File extensions accepted as embedded pictures when sniffing a decoded base64 payload.
private static readonly string[] pictureExtensions = new string[] { ".jpg", ".jpeg", ".bmp", ".png", ".gif", ".tiff", ".svg" };

// Matches a heading paragraph produced by the converter, e.g. <p class="heading-2">...</p>.
// The ".*>?.*</p>" tail is restored here: the previous literal had lost both asterisks and the
// closing tag and was broken across two lines, while ParseDocTitleToHtmlTitle rewrites "</p>"
// inside every match and therefore needs the match to reach the end of the paragraph.
// NOTE(review): ".*" is greedy, so one match runs to the last </p> on the same line - confirm
// that this is the intended extent.
private static readonly Regex titlePattern = new Regex(@"<p\sclass=""((heading|标题)-+\d|(标题|Title|Heading))""?.*>?.*</p>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

// Extracts the "heading-N" / "标题-N" class token from a matched heading paragraph.
private static readonly Regex titlePattern2 = new Regex(@"(heading|标题)-+\d", RegexOptions.IgnoreCase | RegexOptions.Compiled);

// Matches every base64 data URI, e.g. data:application/x-font-woff;base64,d09GRgABAAAAAC9oAA0A==
// The media-type part must not cross a comma, quote or whitespace. A greedy ".*" here merged
// two data URIs on the same line into one match, which then failed to decode as base64.
private static readonly Regex base64Pattern = new Regex(@"data:\w+/?[^,""'\s]*?;base64,(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?", RegexOptions.IgnoreCase | RegexOptions.Compiled);
/// <summary>
/// Converts an uploaded PDF to an HTML fragment: the PDF is rendered to HTML, embedded base64
/// payloads are uploaded and replaced by their stored paths, the body element is cut out, heading
/// paragraphs are turned into heading tags and presentation markup is stripped.
/// </summary>
/// <param name="file">The uploaded PDF; its stream is closed once the document has been loaded.</param>
/// <param name="removeImage">Accepted for interface compatibility; this method does not read it.</param>
/// <returns>The cleaned HTML, or an empty/blank string when the converter produced no output.</returns>
/// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
public string ConvertToHtml(PowerHttpFile file, bool removeImage = false)
{
    ArgumentNullException.ThrowIfNull(file);

    string htmlResult;
    using (var doc = new PdfDocument())
    using (var memoryStream = new MemoryStream())
    {
        try
        {
            doc.LoadFromStream(file.Stream);
        }
        finally
        {
            // Release the upload stream even when the PDF cannot be loaded.
            file.Stream?.Close();
        }

        doc.ConvertOptions.PdfToHtmlHorizontalAlignmentCenter = true;
        doc.ConvertOptions.SetPdfToHtmlOptions(true, true, 1, false);

        // NOTE(review): the stack overflow reported for some PDFs on Linux is raised inside this
        // call (recursion in the converter), and a stack overflow cannot be caught. If the recursion
        // is deep rather than unbounded, running the conversion on a thread created with a larger
        // maxStackSize may avoid it - to be confirmed against the failing document.
        doc.SaveToStream(memoryStream, FileFormat.HTML);

        memoryStream.Seek(0, SeekOrigin.Begin);
        using var streamReader = new StreamReader(memoryStream);
        htmlResult = streamReader.ReadToEnd();
    }

    if (string.IsNullOrWhiteSpace(htmlResult))
    {
        return this.HandleEnterKey(htmlResult);
    }

    var sb = new StringBuilder(htmlResult);

    // StringBuilder.Replace rewrites every occurrence at once, so each distinct data URI is
    // uploaded only once; repeating it would store a duplicate file and replace nothing.
    foreach (var dataUri in base64Pattern.Matches(htmlResult).Select(m => m.Value).Distinct())
    {
        // A data URI looks like: data:application/x-font-woff;base64,d09GRgABAAAAAC9oAA0A==
        var replaceInfo = this.SaveBase64Data(dataUri).ConfigureAwait(false).GetAwaiter().GetResult();
        if (replaceInfo is { Succeed: true })
        {
            sb.Replace(dataUri, replaceInfo.ImagePath);
        }
    }

    htmlResult = sb.ToString();

    // Keep only the body element, which holds the converted content.
    const string bodyClose = "</body>";
    var firstIndex = htmlResult.IndexOf("<body", StringComparison.Ordinal);
    var lastIndex = htmlResult.LastIndexOf(bodyClose, StringComparison.Ordinal);
    if (firstIndex > -1 && lastIndex > firstIndex)
    {
        // Include the whole closing tag; a fixed "+ 6" dropped its final '>'.
        htmlResult = htmlResult.Substring(firstIndex, lastIndex - firstIndex + bodyClose.Length);
    }

    htmlResult = this.ParseDocTitleToHtmlTitle(htmlResult);

    // Strip class names and style/svg markup.
    htmlResult = this.ClearDocContent(htmlResult);

    return this.HandleEnterKey(htmlResult);
}
/// <summary>
/// Decodes one base64 data URI (for example
/// <c>data:application/x-font-woff;base64,d09GRgABAAAAAC9oAA0A==</c>), and uploads the payload
/// through the upload pipeline when it is a supported picture type.
/// </summary>
/// <param name="base64Data">The complete data URI, including the <c>;base64,</c> separator.</param>
/// <returns>
/// <c>null</c> when the input is skipped (blank, no <c>;base64,</c> separator, a web font, or a
/// payload that is not valid base64); otherwise the upload outcome. For an unsupported file type
/// the result has <c>Succeed == false</c> and only <c>Alt</c>/<c>FileName</c> set.
/// </returns>
private async Task<ReplaceImageInfo> SaveBase64Data(string base64Data)
{
    const string splitKey = ";base64,";
    if (string.IsNullOrWhiteSpace(base64Data) || !base64Data.Contains(splitKey, StringComparison.Ordinal))
    {
        return null;
    }

    var base64Info = base64Data.Split(splitKey);

    // Embedded web fonts are not uploaded.
    if (base64Info[0].Contains("x-font-woff", StringComparison.Ordinal))
    {
        return null;
    }

    byte[] bytes;
    try
    {
        bytes = Convert.FromBase64String(base64Info[1]);
    }
    catch (FormatException)
    {
        // A payload that is not valid base64 is skipped like any other unusable match,
        // so that one bad fragment does not abort the whole import.
        return null;
    }

    var replaceInfo = new ReplaceImageInfo();

    // Detect the file type from the decoded bytes.
    var extension = UploadFileContentTypeSecurityHelper.GetExtensionByBytes(bytes, pictureExtensions).FirstOrDefault();
    if (string.IsNullOrWhiteSpace(extension))
    {
        replaceInfo.Alt = "不支持的文件格式";
        replaceInfo.FileName = "不支持的文件格式";
        return replaceInfo;
    }

    // Prefer the registered content type for the detected extension; otherwise fall back to
    // the media type declared in the data URI header.
    var contentType = UploadFileContentTypeSecurityHelper.FileContentTypes.TryGetValue(extension, out var knownContentTypes)
        ? knownContentTypes.FirstOrDefault()
        : base64Info[0].Replace("data:", string.Empty);

    // The generated file name takes the extension without its dot.
    extension = extension.Replace(".", string.Empty);

    using Stream stream = new MemoryStream(bytes);
    var powerHttpFile = new PowerHttpFile
    {
        Stream = stream,
        ContentType = contentType,
        FileName = $"{Guid.NewGuid().ToString("n").Substring(0, 20)}.{extension}",
        ContentLength = bytes.Length
    };

    var uploadImageLogic = this.GetUploadFileLogic(powerHttpFile);

    // The upload was never cancelled by this method, so no CancellationTokenSource is created.
    // The caller blocks on this task, hence the continuation does not capture the context.
    var uploadFileResult = await uploadImageLogic.UploadFileAsync(powerHttpFile, CancellationToken.None).ConfigureAwait(false);

    if (uploadFileResult.IsError)
    {
        replaceInfo.ImagePath = uploadFileResult.RelativePath;
        return replaceInfo;
    }

    // Display path: "$" in the relative path stands for the configured upload directory.
    var uploadRoot = "/" + GlobalUploadConfig.Instance.UploadDirectory;
    replaceInfo.Succeed = true;
    replaceInfo.ImagePath = uploadFileResult.RelativePath.Replace("$", uploadRoot);
    replaceInfo.FileName = uploadFileResult.FileName;
    if (!string.IsNullOrEmpty(uploadFileResult.OriginalImagePath))
    {
        replaceInfo.OriginalImagePath = uploadFileResult.OriginalImagePath.Replace("$", uploadRoot);
    }

    return replaceInfo;
}
/// <summary>
/// Replaces a lone space between two tags with <c>&amp;nbsp;</c>, so that the editor shows such
/// paragraphs as line breaks, and removes <c>margin-left: 0;</c> declarations.
/// </summary>
/// <param name="htmlResult">The HTML document.</param>
/// <returns>The adjusted HTML; a null or empty input is returned unchanged.</returns>
private string HandleEnterKey(string htmlResult)
{
    if (string.IsNullOrEmpty(htmlResult))
    {
        return htmlResult;
    }

    // <p><span>&nbsp;</span></p> is displayed as a line break in the editor. The replacement
    // text had degraded to a plain space, which made this step a no-op.
    // Both patterns are literal text, so plain ordinal replacement is used instead of Regex.
    return htmlResult
        .Replace("> <", ">&nbsp;<", StringComparison.Ordinal)
        // A centered table keeps "margin-left: 0;", which stops the editor from repositioning it.
        .Replace("margin-left: 0;", string.Empty, StringComparison.Ordinal);
}
/// <summary>
/// Strips presentation markup from the converted HTML: removes class attributes, drops
/// style/defs/path elements, moves the content of each svg element into a div, and copies
/// <c>xlink:href</c> of img elements into <c>src</c>.
/// </summary>
/// <param name="html">The HTML content.</param>
/// <returns>The cleaned HTML; blank input is returned unchanged.</returns>
private string ClearDocContent(string html)
{
    if (string.IsNullOrWhiteSpace(html))
    {
        return html;
    }

    var doc = NSoup.NSoupClient.Parse(html);
    foreach (var child in doc.Children)
    {
        StripClassAttributes(child);
    }

    // <g> elements are not removed (that call was disabled).
    RemoveElementsByTag("style");
    RemoveElementsByTag("defs");
    RemoveElementsByTag("path");

    try
    {
        // Move the content of every svg into a div placed in front of it, then drop the svg.
        foreach (var svg in doc.GetElementsByTag("svg"))
        {
            var innerHtml = svg.Html();
            var wrapper = new Element(NSoup.Parse.Tag.ValueOf("div"), svg.BaseUri);
            wrapper.Html(innerHtml);
            svg.Before(wrapper);
            svg.Remove();
        }

        // Embedded pictures carry their address in xlink:href; expose it as src.
        foreach (var image in doc.GetElementsByTag("img"))
        {
            var href = image.Attr("xlink:href");
            if (!string.IsNullOrWhiteSpace(href))
            {
                image.RemoveAttr("xlink:href");
                image.Attr("src", href);
            }
        }
    }
    catch (Exception)
    {
        // Deliberately best-effort: a failure here leaves the remaining markup as it is.
    }

    // Remove any svg that is still present.
    RemoveElementsByTag("svg");
    return doc.Html();

    // Removes every element with the given tag name; failures are ignored (best-effort).
    void RemoveElementsByTag(string tagName)
    {
        try
        {
            var elements = doc.GetElementsByTag(tagName);
            if (elements.Any())
            {
                elements.Remove();
            }
        }
        catch (Exception)
        {
            // Deliberately ignored.
        }
    }

    // Removes a non-blank class attribute from the element and, recursively, from its descendants.
    void StripClassAttributes(Element element)
    {
        if (element == null)
        {
            return;
        }

        if (!string.IsNullOrWhiteSpace(element.ClassName()))
        {
            element.RemoveAttr("class");
        }

        foreach (var child in element.Children)
        {
            StripClassAttributes(child);
        }
    }
}
/// <summary>
/// Converts paragraphs styled as "Heading-N" / "标题-N" into <c>hN</c> elements; a heading
/// paragraph without a level becomes <c>h1</c>.
/// </summary>
/// <param name="html">The HTML document.</param>
/// <returns>The converted HTML; blank input or input without heading paragraphs is returned unchanged.</returns>
private string ParseDocTitleToHtmlTitle(string html)
{
    if (string.IsNullOrWhiteSpace(html))
    {
        return html;
    }

    var titles = titlePattern.Matches(html);
    if (titles.Count == 0)
    {
        return html;
    }

    var htmlResultBuilder = new StringBuilder(html);
    foreach (Match item in titles)
    {
        // Default to h1 when the class carries no level.
        var headingTag = "h1";

        var titleLevel = titlePattern2.Match(item.Value);
        if (titleLevel.Success)
        {
            // The level token always ends with its digit. Reading the last character also
            // handles repeated dashes ("heading--2"), where Split('-')[1] was empty.
            var level = (int)char.GetNumericValue(titleLevel.Value[^1]);

            // HTML only defines h1..h6.
            headingTag = "h" + Math.Clamp(level, 1, 6);
        }

        // NOTE(review): the previous code removed every literal space, which fused
        // "<h1 class=" into "<h1class=" and glued the words of the heading together.
        // It is assumed the intent was to drop &nbsp; entities - confirm.
        var titleHtml = item.Value
            .Replace("<p", "<" + headingTag)
            .Replace("</p>", "</" + headingTag + ">")
            .Replace("&nbsp;", string.Empty);

        htmlResultBuilder.Replace(item.Value, titleHtml);
    }

    return htmlResultBuilder.ToString();
}
// Builds the upload logic for one file: a local file processor wrapping the file, combined
// with the upload context, the before/after upload providers, the record helper and the logger
// held by this instance.
private UploadFileLogic GetUploadFileLogic(PowerHttpFile file)
{
    var localProcessor = new LocalUploadFileProcessor(
        file,
        this.uploadFileContext,
        this.watermarkProvider,
        this.imageUtilitiesBuilderFactory,
        this.watermarkLogic);

    return new UploadFileLogic(
        this.uploadFileContext,
        localProcessor,
        this.beforeUploadProvider,
        this.afterUploadProvider,
        this.uploadFileRecordHelper,
        this.uploadFileLogicLogger);
}
// Outcome of storing one embedded base64 payload.
private class ReplaceImageInfo
{
    public string Key { get; set; }

    // Alternative text; set when the payload type is not supported.
    public string Alt { get; set; }

    // Stored file name, or the "unsupported format" text.
    public string FileName { get; set; }

    // Path that replaces the data URI in the HTML.
    public string ImagePath { get; set; }

    // Path of the original picture, when the upload reports one.
    public string OriginalImagePath { get; set; }

    // True when the upload finished without error.
    public bool Succeed { get; set; }

    // True when the upload succeeded and the stored path ends with a known picture extension.
    public bool IsImage
    {
        get
        {
            if (!this.Succeed || string.IsNullOrWhiteSpace(this.ImagePath))
            {
                return false;
            }

            foreach (var pictureExtension in pictureExtensions)
            {
                if (this.ImagePath.EndsWith(pictureExtension, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }
    }
}`
1、环境:使用的.Net8,跨平台应用Web网站。使用的组件版本
2、Windows 下使用 IIS 部署,将 PDF 文件转换为 HTML 字符串是成功的。
Linux 下,有的 PDF 文件可以成功导入,有的不行。
同一个 PDF 文件,在 Linux(Rocky 10)下转换时报错:
Stack overflow.
at System.String.Intern()
at spr毗.⨫(System.Collections.ArrayList, System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
......(中间省略很多相同的内容)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.⻖(System.String)
at spr팷.⨫(spr毗, System.String, Boolean)
at spr팷.⨫(System.String, spr毗, System.String, Boolean)
at spr팷.⨫(spr℺, System.String)
at spr팷.(spr℺)
at spr팷.⨫(Boolean)
at spr팷.㫹()
at spr㚳.⨫(Int32)
at spr㞢.⨫(Spire.Pdf.PdfDocumentBase, System.String, Boolean, Int32, Int32)
at spr㗄.⨫(Spire.Pdf.PdfDocumentBase, System.IO.Stream, Int32, Int32)
at Spire.Pdf.PdfDocumentBase.ⷧ(Int32, Int32, System.IO.Stream)
at Spire.Pdf.PdfDocumentBase.SaveToHtml(System.IO.Stream)
at Spire.Pdf.PdfDocumentBase.Save(System.IO.Stream, Spire.Pdf.FileFormat)
at Spire.Pdf.PdfDocument.SaveToStream(System.IO.Stream, Spire.Pdf.FileFormat)
at PowerEasy.Modules.ContentManage.Business.SpirePdfImportWordLogic.ConvertToHtml(PowerEasy.Foundation.UploadBackground.PowerHttpFile, Boolean)
at PowerEasy.Modules.ContentManage.Business.WordImportLogicWrapper.ConvertToHtml(PowerEasy.Foundation.UploadBackground.PowerHttpFile, Boolean)
还有一处错误(另一个版本8.x或9.x?):
Stack overflow.
at System.SpanHelpers.IndexOf(Char ByRef, Int32, Char ByRef, Int32)
at System.String.Replace(System.String, System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
at spr毗.Ⰹ(System.String)
......(中间省略很多相同的内容)
应用程序中调用的方法代码(C#):
`SpirePdfImportWordLogic.cs类:
private static string[] pictureExtensions = new string[] { ".jpg", ".jpeg", ".bmp", ".png", ".gif", ".tiff", ".svg" };
private static Regex titlePattern = new Regex(@"<p\sclass=""((heading|标题)-+\d|(标题|Title|Heading))""?.*>?.*</p>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static Regex titlePattern2 = new Regex(@"(heading|标题)-+\d", RegexOptions.IgnoreCase | RegexOptions.Compiled);
public string ConvertToHtml(PowerHttpFile file, bool removeImage = false)
{
using (var doc = new PdfDocument())
using (var memoryStream = new MemoryStream())
{
string htmlResult = string.Empty;