Getting text remove issue from pdf after search and redaction

Hello
I am using PyMuPDF (1.19.6) to search for strings in a PDF file and then redact the matches. **But the redaction also removes text that is located just above the matched result.**
Please help me with this strange issue.
I am also attaching the original file and converted file.
[DRAFT_Executive.pdf](https://github.com/pymupdf/PyMuPDF/files/11663707/DRAFT_Executive.pdf)
[highlighted_file.pdf](https://github.com/pymupdf/PyMuPDF/files/11663709/highlighted_file.pdf)

**Reproduce step**
import re
import fitz
import sys, json
# Repro script: find bracketed tags such as "[s:a:o]" on every page, record
# their positions, redact them and save the result as "highlighted_file.pdf".
file_path = "DRAFT_Executive.pdf"

# A tag is "[", optional whitespace, a type letter, ":", a letter, ":",
# "o" or "r", optional whitespace, "]".
# NOTE: inside a character class "|" is a literal character, so the earlier
# "[s|c|d|i|t]" / "[o|r]" also accepted "|" as a tag letter. It is dropped here.
pattern = r'\[\s*([scdit]):([a-z]):([or])\s*\]'  # Replace with your desired regex pattern
tag_re = re.compile(pattern, re.IGNORECASE | re.MULTILINE | re.DOTALL)

# Fraction of the hit rectangle's height trimmed from its top and from its
# bottom before redacting (0.25 keeps the central half).
# NOTE(review): PyMuPDF is understood to remove text character by character
# whenever a character's bounding box overlaps a redaction rectangle, and the
# rectangles returned by search_for() span the full font height, which can
# overlap the neighbouring lines. Redacting the full-height rectangle is what
# removed the text just above the match. Confirm against the
# Page.apply_redactions documentation of the installed version and tune this
# value if lines are packed unusually tightly.
REDACT_VERTICAL_INSET = 0.25


def _make_entry(y2, y1, tag, x2, x1, page_no):
    """Build one result record; the y values are measured from the page bottom."""
    return {'y2': y2, 'y1': y1, 'tag': tag, 'x2': x2, 'x1': x1, 'page': page_no}


def _coords_key(entry, page_number):
    """Return the string used in ``addedTags`` to de-duplicate a rectangle."""
    parts = (entry['y2'], entry['y1'], entry['x2'], entry['x1'], page_number)
    return "_".join(str(part) for part in parts)


def _widen_to_pending(x1, x2, pending):
    """Return (x1, x2) extended horizontally so it also covers *pending*."""
    if "x2" in pending and x2 < pending["x2"]:
        x2 = pending["x2"]
    if "x1" in pending and x1 > pending["x1"]:
        x1 = pending["x1"]
    return x1, x2


doc = fitz.open(file_path)
resultOutput = []   # every recorded tag, across all pages
tagsPerPage = {}    # page.number -> list of tag records found on that page
addedTags = set()   # coordinate keys already recorded (de-duplication)

# ---- Pass 1: locate the tags -------------------------------------------------
for page in doc:
    text = page.get_text()
    page_height = page.rect.height
    page_tags = tagsPerPage[page.number] = []

    for match in tag_re.finditer(text):
        tag = match.group()
        pending = {}            # partial hit carried over from a previous rectangle
        needle_started = False  # True after a rectangle holding "[" but no full tag

        for rect in page.search_for(tag):
            # rect is (left, top, right, bottom) in PyMuPDF's top-left-origin
            # coordinates; the records store y flipped to a bottom-left origin.
            x1, top, x2, bottom = rect
            y1 = page_height - bottom
            y2 = page_height - top

            # Text inside the hit rectangle with all whitespace removed.
            clip_text = ''.join(page.get_text("text", clip=rect).split())

            if tag_re.match(clip_text):
                # The rectangle holds the complete tag.
                entry = _make_entry(y2, y1, tag, x2, x1, page.number + 1)
                key = _coords_key(entry, page.number)
                if key not in addedTags:
                    addedTags.add(key)
                    page_tags.append(entry)
                    resultOutput.append(dict(entry))
            elif not needle_started and '[' in clip_text:
                # Opening part of a tag whose rectangle does not match on its own.
                needle_started = True
                pending = _make_entry(y2, y1, tag, x2, x1, page.number + 1)
            elif needle_started and ']' in clip_text:
                # Closing part: record the current rectangle, widened to also
                # cover the pending part; de-duplicate on the pending part's key.
                needle_started = False
                key = _coords_key(pending, page.number)
                if key not in addedTags:
                    x1, x2 = _widen_to_pending(x1, x2, pending)
                    merged = _make_entry(y2, y1, tag, x2, x1, page.number + 1)
                    addedTags.add(key)
                    page_tags.append(merged)
                    resultOutput.append(merged)
                pending = {}
            else:
                # Neither a start nor an end: keep accumulating horizontal extent.
                x1, x2 = _widen_to_pending(x1, x2, pending)
                pending = _make_entry(y2, y1, tag, x2, x1, page.number + 1)

# ---- Pass 2: redact the recorded tags ---------------------------------------
for page in doc:
    items = tagsPerPage.get(page.number)
    if not items:
        continue

    page_height = page.rect.height
    for item in items:
        # Flip the stored y values back to PyMuPDF's top-left-origin coordinates.
        top = page_height - item['y2']
        bottom = page_height - item['y1']
        # Trim the rectangle vertically so it stays clear of adjacent lines.
        inset = (bottom - top) * REDACT_VERTICAL_INSET
        page.add_redact_annot(
            fitz.Rect(item['x1'], top + inset, item['x2'], bottom - inset),
            text_color=(0, 0, 0),
            cross_out=True,
        )

    # Apply once per page, after all of its redaction annotations were added.
    page.apply_redactions()

doc.save("highlighted_file.pdf", garbage=3, deflate=True)
doc.close()

**Configuration**
OS ubuntu
Python 3.8
PyMuPDF 1.19.6        

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Getting text remove issue from pdf after search and redaction #2456

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Getting text remove issue from pdf after search and redaction #2456

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions