Skip to content

Getting text remove issue from pdf after search and redaction #2456

@ashifaliclientpoint

Description

@ashifaliclientpoint

Hello
I am using PyMuPDF (1.19.6) to search for a string in a PDF file and then redact the matches. However, the redaction also removes text that is located just above each match.
Please help me with this strange issue.
I am also attaching the original file and the converted file.
DRAFT_Executive.pdf
highlighted_file.pdf

Steps to reproduce
import re
import fitz
import sys, json
file_path = "DRAFT_Executive.pdf"

# Tag syntax: "[<kind>:<letter>:<o|r>]", optionally padded with whitespace
# just inside the brackets, e.g. "[s:a:r]".
# The square brackets must be escaped: unescaped, "[...]" opens a character
# class and the pattern does not even compile (unbalanced parenthesis).
# Inside a character class "|" is a literal pipe, not alternation, so the
# classes list the accepted letters directly.
pattern = r'\[\s*([scdit]):([a-z]):([or])\s*\]'  # Replace with your desired regex pattern
TAG_RE = re.compile(pattern, re.IGNORECASE | re.MULTILINE | re.DOTALL)

# Fraction of a hit's height trimmed from the top and from the bottom of the
# rectangle before it is redacted.
# NOTE(review): redaction removes characters whose bbox overlaps the redaction
# rectangle, and the rectangles returned by search_for() use the full line
# height, which can reach into the line above/below. 0.2 is a heuristic --
# tune it for the documents at hand.
REDACT_INSET_RATIO = 0.2


def _coords_key(y2, y1, x2, x1, page_number):
    """Return the string used to de-duplicate a hit by position and page."""
    return f"{y2}_{y1}_{x2}_{x1}_{page_number}"


def _tag_entry(page, tag, x1, x2, y1, y2):
    """Build the record stored for one tag occurrence.

    y1/y2 are measured from the bottom edge of the page (page height minus
    the MuPDF y value); 'page' is 1-based.
    """
    return {'y2': y2, 'y1': y1, 'tag': tag, 'x2': x2, 'x1': x1, 'page': (page.number + 1)}


def _collect_page_tags(page, added_tags):
    """Find all tag occurrences on *page* and return their records.

    Every regex match in the page text is located with page.search_for().
    A hit rectangle whose clipped text is a complete tag is recorded as is.
    If a tag is returned as several rectangles, the pieces between the one
    containing '[' and the one containing ']' are merged: the x-range is the
    union of the pieces, the y-range is that of the closing piece.

    *added_tags* is a set of position keys shared across pages; it is
    updated in place so the same location is never recorded twice.
    """
    entries = []
    page_height = page.rect.height

    for match in TAG_RE.finditer(page.get_text()):
        tag = match.group()
        pending = {}           # partial record while a split tag is open
        in_split_tag = False   # True between the '[' piece and the ']' piece

        for rect in page.search_for(tag):
            x1, top, x2, bottom = rect
            # Flip to bottom-left-origin coordinates for the stored record.
            y1 = page_height - bottom
            y2 = page_height - top

            # Text under the hit with all whitespace removed.
            clip_text = ''.join(page.get_text("text", clip=rect).split())

            if TAG_RE.match(clip_text):
                key = _coords_key(y2, y1, x2, x1, page.number)
                if key not in added_tags:
                    added_tags.add(key)
                    entries.append(_tag_entry(page, tag, x1, x2, y1, y2))
                continue

            if not in_split_tag and '[' in clip_text:
                # First piece of a tag that spans several rectangles.
                in_split_tag = True
                pending = _tag_entry(page, tag, x1, x2, y1, y2)
            elif in_split_tag and ']' in clip_text:
                # Closing piece: emit the merged record unless already seen.
                in_split_tag = False
                key = _coords_key(pending['y2'], pending['y1'],
                                  pending['x2'], pending['x1'], page.number)
                if key not in added_tags:
                    x1 = min(x1, pending['x1'])
                    x2 = max(x2, pending['x2'])
                    added_tags.add(key)
                    entries.append(_tag_entry(page, tag, x1, x2, y1, y2))
                pending = {}
            else:
                # Middle piece (or stray fragment): keep widening the x-range.
                if pending:
                    x1 = min(x1, pending['x1'])
                    x2 = max(x2, pending['x2'])
                pending = _tag_entry(page, tag, x1, x2, y1, y2)

    return entries


def _redact_page(page, entries):
    """Add one redaction per record in *entries* and apply them to *page*.

    Redactions are applied once per page, after all annotations have been
    added, instead of once per tag.
    """
    if not entries:
        return

    page_height = page.rect.height
    for item in entries:
        # Convert the stored bottom-origin values back to MuPDF coordinates.
        top = page_height - item['y2']
        bottom = page_height - item['y1']
        # Shrink vertically so the rectangle stays clear of adjacent lines.
        inset = (bottom - top) * REDACT_INSET_RATIO
        page.add_redact_annot(
            fitz.Rect(item['x1'], top + inset, item['x2'], bottom - inset),
            text_color=(0, 0, 0),
            cross_out=True,
        )

    page.apply_redactions()


if __name__ == "__main__":
    doc = fitz.open(file_path)
    resultOutput = []
    tagsPerPage = {}
    addedTags = set()

    try:
        # Pass 1: locate every tag on every page.
        for page in doc:
            tagsPerPage[page.number] = _collect_page_tags(page, addedTags)
            resultOutput.extend(tagsPerPage[page.number])

        # Pass 2: redact the recorded locations.
        for page in doc:
            _redact_page(page, tagsPerPage.get(page.number, []))

        doc.save("highlighted_file.pdf", garbage=3, deflate=True)
    finally:
        # Release the document even if searching or saving fails.
        doc.close()

Configuration
OS ubuntu
Python 3.8
PyMuPDF 1.19.6

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions