We currently use Bytescout to convert PDF to text and are now switching to MuPDF.NET, but we are not getting the same output that Bytescout produced.
Below is my existing method:
/// <summary>
/// Converts a PDF stream to plain text using the Bytescout XML extractor and
/// returns the text as an in-memory stream.
/// </summary>
/// <param name="pdf">Stream containing the PDF document to convert.</param>
/// <returns>
/// A <see cref="MemoryStream"/> containing every non-empty column value of every
/// extracted <c>row</c> element, each followed by a line break. The stream is
/// rewound to position 0 and left open for the caller.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="pdf"/> is <c>null</c>.</exception>
private MemoryStream ConvertPdfToTxt(Stream pdf)
{
    if (pdf is null)
    {
        throw new ArgumentNullException(nameof(pdf));
    }

    // The extractor holds the loaded document; release it when the method exits.
    using XMLExtractor extractor = new()
    {
        RegistrationName = config["RegistrationName"],
        RegistrationKey = config["RegistrationKey"],
        AutoAlignColumnsToHeader = true,
        ConsiderVerticalBorders = true,
        ExtractAnnotations = true,
        ExtractColumnByColumn = true,
        ExtractInvisibleText = true,
        ExtractShadowLikeText = true
    };
    extractor.LoadDocumentFromStream(pdf);

    // The extractor emits the document as XML; the text lives in <row> elements.
    XmlDocument xml = new();
    xml.LoadXml(extractor.GetXML());

    StringBuilder text = new();
    foreach (XmlNode row in xml.GetElementsByTagName("row"))
    {
        foreach (XmlNode column in row.ChildNodes)
        {
            string value = column.InnerText;

            // Blank cells contribute nothing. Cells containing "PDF Extractor" are
            // dropped as well (presumably the extractor's own banner text - confirm).
            if (value.Length == 0 || value.Contains("PDF Extractor", StringComparison.Ordinal))
            {
                continue;
            }

            // One value per line for ease of reading in the resulting text.
            text.AppendLine(value);
        }
    }

    var stream = new MemoryStream();

    // leaveOpen: disposing the writer flushes it without closing the stream we return.
    using (var writer = new StreamWriter(stream, leaveOpen: true))
    {
        writer.Write(text.ToString());
    }

    stream.Position = 0;
    return stream;
}
I have tried the code below using MuPDF.NET:
/// <summary>
/// Extracts the text of <c>assets/sample.pdf</c> (under the web root) with MuPDF.NET
/// and returns it as an in-memory stream.
/// </summary>
/// <remarks>
/// As a side effect, the text of the first page is also written to
/// <c>assets/output.txt</c> under the web root, when the document has at least one page.
/// </remarks>
/// <returns>
/// A <see cref="MemoryStream"/> containing the text of every page, each followed by a
/// line break. The stream is rewound to position 0 and left open for the caller.
/// </returns>
private MemoryStream ConvertPdfToTxt2()
{
    string assetsDir = Path.Combine(hostingEnvironment.WebRootPath, "assets");

    // Release the document when the method exits.
    using var doc = new MuPDF.NET.Document(Path.Combine(assetsDir, "sample.pdf"));

    // Save the first page's text to a file; skipped for a document without pages
    // so that LoadPage(0) is never called on an empty document.
    if (doc.PageCount > 0)
    {
        MuPDF.NET.TextPage textPage = doc.LoadPage(0).GetTextPage();
        string extractedText = textPage.ExtractText();
        System.IO.File.WriteAllText(Path.Combine(assetsDir, "output.txt"), extractedText);
    }

    // Collect the text of every page, one page after another.
    var sbRow = new StringBuilder();
    for (int pageIndex = 0; pageIndex < doc.PageCount; pageIndex++)
    {
        var page = doc.LoadPage(pageIndex);
        var text = page.GetText();
        sbRow.AppendLine(text);
    }

    var stream = new MemoryStream();

    // leaveOpen: disposing the writer flushes it without closing the stream we return.
    using (var writer = new StreamWriter(stream, leaveOpen: true))
    {
        writer.Write(sbRow.ToString());
    }

    // Rewind so the caller reads from the beginning.
    stream.Position = 0;
    return stream;
}
We currently use Bytescout to convert PDF to text and are now switching to MuPDF.NET, but we are not getting the same output that Bytescout produced.
Below is my existing method:
private MemoryStream ConvertPdfToTxt(Stream pdf)
{
StringBuilder sbRow = new();
// create object of pdf extractor file reader.
XMLExtractor extractor = new()
{
RegistrationName = config["RegistrationName"],
RegistrationKey = config["RegistrationKey"],
AutoAlignColumnsToHeader = true,
ConsiderVerticalBorders = true,
ExtractAnnotations = true,
ExtractColumnByColumn = true,
ExtractInvisibleText = true,
ExtractShadowLikeText = true
};
I have tried the code below using MuPDF.NET:
private MemoryStream ConvertPdfToTxt2()
{
// Create a MemoryStream to write the extracted text.
var sbRow = new StringBuilder();