page.search_for returns extra height

Hello
I am using PyMuPDF (1.19.6) to search for strings in a PDF file and then redact them. However, the rectangles returned by `page.search_for` have extra height compared with the text in the original PDF file, so the redaction also removes other text that is overlapped by the rectangle of the matched string.
Please help me with this strange issue.
I am also attaching the original file and converted file.
[DRAFT_Executive.pdf](https://github.com/pymupdf/PyMuPDF/files/11663707/DRAFT_Executive.pdf)
[highlighted_file.pdf](https://github.com/pymupdf/PyMuPDF/files/11663709/highlighted_file.pdf)

**Steps to reproduce**
import re
import fitz
import sys, json
# Input PDF that is scanned for tags.
file_path = "DRAFT_Executive.pdf"

# A tag looks like "[<c>:<letter>:<c>]", with optional whitespace just inside
# the brackets; the three parts are captured as groups 1-3.
# NOTE(review): inside a regex character class "|" is a literal character, so
# "[s|c|d|i|t]" and "[o|r]" also accept "|" itself — confirm this is intended.
pattern = r'\[\s*([s|c|d|i|t]):([a-z]):([o|r])\s*\]'  # Replace with your desired regex pattern
doc = fitz.open(file_path)
resultOutput = []    # every recorded tag hit, across all pages
tagsPerPage = {}     # page.number -> list of tag-hit dicts recorded for that page
addedTags = set()    # "y2_y1_x2_x1_page" keys of hits already recorded (de-duplication)

for page in doc:
    text = page.get_text()
    # Every page gets an entry, even when no tag is found on it.
    tagsPerPage[page.number]=[]
    matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
    # NOTE(review): re.finditer returns an iterator object, which is always
    # truthy, so this check never skips anything.
    if matches:
        for match in matches:
            # NOTE(review): start/end are not read anywhere in the visible code.
            start, end = match.span()
            # Look the matched text up on the page; each returned rectangle is
            # examined below.
            coordinates = page.search_for(match.group())
            # Per-match state for stitching together a tag that is reported as
            # several rectangles.
            tempDict={}          # pending fragment data (empty when nothing is pending)
            firstCoordStr = ""   # NOTE(review): never read in the visible code
            singleTagArr = []    # NOTE(review): never read in the visible code
            needleStarted=0      # 1 once a fragment containing "[" has been seen
            for rect in coordinates:
                # The rectangle's four values are unpacked in order into
                # x1, y2, x2, y1.
                # NOTE(review): presumably the rectangle is (x0, y0, x1, y1)
                # with y growing downward, which would make y2 the top edge
                # and y1 the bottom edge — confirm against the PyMuPDF docs.
                x1, y2, x2, y1 = rect

                # NOTE(review): height is not read anywhere in the visible code.
                height = y2 - y1

                # Flip both y values relative to the page height; the redaction
                # pass later in this script flips them back the same way.
                y1 = page.rect.height - y1
                y2 = page.rect.height - y2


                # Key identifying this rectangle on this page, used with addedTags.
                currCoordsStr = str(y2)+"_"+str(y1)+"_"+str(x2)+"_"+str(x1)+"_"+str(page.number)
                # Re-extract the text inside the rectangle and strip all
                # whitespace so it can be tested against the tag pattern.
                currText = page.get_text("text",clip=rect)
                currTextTrimmed = ''.join(currText.split())
                # re.match anchors at the start: true when the clipped text
                # begins with a complete tag.
                tagComplete = re.match(pattern, currTextTrimmed, re.IGNORECASE | re.MULTILINE | re.DOTALL)
                if(tagComplete):
                    # Whole tag inside one rectangle: record it once.
                    if currCoordsStr in addedTags:
                        pass
                    else:
                        addedTags.add(currCoordsStr)
                        tagsPerPage[page.number].append({'y2':y2, 'y1':y1, 'tag':match.group(), 'x2':x2, 'x1':x1, 'page':(page.number+1) })
                        resultOutput.append({'y2':y2, 'y1':y1, 'tag':match.group(), 'x2':x2, 'x1':x1, 'page':(page.number+1) })
                else:
                    # The rectangle does not start with a complete tag; treat
                    # it as a fragment (presumably a tag split over several
                    # rectangles — confirm).
                    if needleStarted == 0 and currTextTrimmed.find('[')!=-1:
                        # Opening fragment: start collecting.
                        needleStarted = 1
                        tempDict = {}
                        # NOTE(review): tempDict was just reset to {}, so the
                        # two membership tests below can never succeed.
                        if "x2" in tempDict and x2 < tempDict["x2"]:
                            x2 = tempDict["x2"]
                        if "x1" in tempDict and x1 > tempDict["x1"]:
                            x1 = tempDict["x1"]
                        tempDict={'y2':y2, 'y1':y1, 'tag':match.group(), 'x2':x2, 'x1':x1, 'page':(page.number+1) }
                    elif needleStarted == 1 and currTextTrimmed.find(']')!=-1:
                        # Closing fragment: finish the pending tag.
                        needleStarted = 0
                        # The de-duplication key is built from the pending
                        # fragment's coordinates, not this rectangle's.
                        currCoordsStr = str(tempDict['y2'])+"_"+str(tempDict['y1'])+"_"+str(tempDict['x2'])+"_"+str(tempDict['x1'])+"_"+str(page.number)
                        if currCoordsStr in addedTags:
                            tempDict = {}
                            pass
                        else:
                            # Widen horizontally: x2 becomes the larger and x1
                            # the smaller of the current and pending values.
                            if "x2" in tempDict and x2 < tempDict["x2"]:
                                x2 = tempDict["x2"]
                            if "x1" in tempDict and x1 > tempDict["x1"]:
                                x1 = tempDict["x1"]
                            # The recorded y values are those of this closing
                            # rectangle only.
                            tempDict={'y2':y2, 'y1':y1, 'tag':match.group(), 'x2':x2, 'x1':x1, 'page':(page.number+1) }
                            addedTags.add(currCoordsStr)
                            # The same dict object is appended to both containers.
                            tagsPerPage[page.number].append(tempDict)
                            resultOutput.append(tempDict)
                            tempDict = {}
                    else:
                        # Any other fragment: widen x against the pending data
                        # (if any) and replace the pending data with this
                        # rectangle's values.
                        if "x2" in tempDict and x2 < tempDict["x2"]:
                            x2 = tempDict["x2"]
                        if "x1" in tempDict and x1 > tempDict["x1"]:
                            x1 = tempDict["x1"]
                        tempDict={'y2':y2, 'y1':y1, 'tag':match.group(), 'x2':x2, 'x1':x1, 'page':(page.number+1) }

# Redaction pass: add one redaction annotation per recorded tag, apply the
# redactions, then write the result to "highlighted_file.pdf".
if tagsPerPage:
    for page in doc:
        page_tags = tagsPerPage[page.number]
        if not page_tags:
            continue
        page_height = page.rect.height
        for item in page_tags:
            # Only handle entries recorded for this page (stored 1-based).
            if item['page'] != page.number + 1:
                continue
            # The stored y values are "page height - y"; subtracting them from
            # the page height again recovers the values that were stored.
            rect_y0 = page_height - item['y2']
            rect_y1 = page_height - item['y1']
            page.add_redact_annot(
                fitz.Rect(item['x1'], rect_y0, item['x2'], rect_y1),
                text_color=(0, 0, 0),
                cross_out=True,
            )
        # Apply once per page after all annotations have been added, instead
        # of re-running the redaction after every single tag.
        page.apply_redactions()

doc.save("highlighted_file.pdf", garbage=3, deflate=True)
doc.close()

Configuration
OS ubuntu
Python 3.8
PyMuPDF 1.19.6        

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

page.search_for return extra height #2453

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

page.search_for return extra height #2453

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions