-
Notifications
You must be signed in to change notification settings - Fork 678
Description
Hello
I am using PyMuPDF (1.19.6) to search for strings in a PDF file and redact them. However, the rectangles returned for the matches are taller than the text in the original PDF, and applying the redaction also removes neighboring text that the redaction rectangle overlaps.
Please help me with this strange issue.
I am also attaching the original file and converted file.
DRAFT_Executive.pdf
highlighted_file.pdf
Steps to reproduce
import re
import fitz
import sys, json
# Reproduction script: locate bracketed tags (e.g. "[s:a:o]") on every page of
# a PDF, record where they were found, and redact them.
file_path = "DRAFT_Executive.pdf"
# The enclosing square brackets are literal characters of the tag and must be
# escaped; left unescaped, the first "[" opens a character class and the
# pattern fails to compile ("unbalanced parenthesis"). Inside a character
# class "|" is a literal character, so the alternatives are listed without it.
pattern = r'\[\s*([scdit]):([a-z]):([or])\s*\]'  # Replace with your desired regex pattern
regexFlags = re.IGNORECASE | re.MULTILINE | re.DOTALL


def _tag_entry(tag, x1, x2, y1, y2, page_number):
    """Build the record stored for one located tag; 'page' is 1-based."""
    return {'y2': y2, 'y1': y1, 'tag': tag, 'x2': x2, 'x1': x1, 'page': page_number + 1}


def _coords_key(y2, y1, x2, x1, page_number):
    """Return the string used to de-duplicate hits reported at the same place."""
    return f"{y2}_{y1}_{x2}_{x1}_{page_number}"


# Per the PyMuPDF documentation, text-search rectangles are by default as tall
# as the font's full line height, and apply_redactions() removes text whose
# bbox overlaps a redaction rectangle. Requesting reduced glyph heights keeps
# the rectangles close to the visible glyphs, so text on neighbouring lines is
# less likely to be caught by the redaction.
# NOTE(review): this also shrinks the y-coordinates collected in resultOutput;
# confirm that any consumer of those values expects the tighter boxes.
fitz.TOOLS.set_small_glyph_heights(True)

doc = fitz.open(file_path)
resultOutput = []
tagsPerPage = {}
addedTags = set()
try:
    # Pass 1: collect tag locations page by page.
    for page in doc:
        text = page.get_text()
        pageTags = tagsPerPage[page.number] = []
        pageHeight = page.rect.height
        for match in re.finditer(pattern, text, regexFlags):
            tag = match.group()
            tempDict = {}      # pending fragment of a tag that search_for split up
            needleStarted = 0  # 1 while an opening "[" fragment awaits its "]"
            for rect in page.search_for(tag):
                # A search rectangle unpacks as (left, top, right, bottom).
                left, top, right, bottom = rect
                x1, x2 = left, right
                # Records store y measured from the bottom edge of the page.
                y1 = pageHeight - bottom
                y2 = pageHeight - top
                currText = page.get_text("text", clip=rect)
                currTextTrimmed = ''.join(currText.split())

                if re.match(pattern, currTextTrimmed, regexFlags):
                    # The rectangle holds a complete tag: record it once.
                    currCoordsStr = _coords_key(y2, y1, x2, x1, page.number)
                    if currCoordsStr not in addedTags:
                        addedTags.add(currCoordsStr)
                        pageTags.append(_tag_entry(tag, x1, x2, y1, y2, page.number))
                        resultOutput.append(_tag_entry(tag, x1, x2, y1, y2, page.number))
                    continue

                if needleStarted == 0 and '[' in currTextTrimmed:
                    # First fragment of a split tag: remember it and wait.
                    needleStarted = 1
                    tempDict = _tag_entry(tag, x1, x2, y1, y2, page.number)
                elif needleStarted == 1 and ']' in currTextTrimmed:
                    # Closing fragment: the de-duplication key is built from
                    # the pending fragment, the stored record from the
                    # horizontal union with the current rectangle.
                    needleStarted = 0
                    currCoordsStr = _coords_key(
                        tempDict['y2'], tempDict['y1'],
                        tempDict['x2'], tempDict['x1'], page.number,
                    )
                    if currCoordsStr not in addedTags:
                        merged = _tag_entry(
                            tag,
                            min(x1, tempDict['x1']),
                            max(x2, tempDict['x2']),
                            y1, y2, page.number,
                        )
                        addedTags.add(currCoordsStr)
                        pageTags.append(merged)
                        resultOutput.append(merged)
                    tempDict = {}
                else:
                    # Any other fragment: widen the pending record to cover it.
                    if tempDict:
                        x1 = min(x1, tempDict['x1'])
                        x2 = max(x2, tempDict['x2'])
                    tempDict = _tag_entry(tag, x1, x2, y1, y2, page.number)

    # Pass 2: add one redaction annotation per recorded tag, then apply them
    # once per page.
    for page in doc:
        pageTags = tagsPerPage.get(page.number)
        if not pageTags:
            continue
        pageHeight = page.rect.height
        for item in pageTags:
            # Convert the stored bottom-based y values back to page coordinates.
            top = pageHeight - item['y2']
            bottom = pageHeight - item['y1']
            page.add_redact_annot(
                fitz.Rect(item['x1'], top, item['x2'], bottom),
                text_color=(0, 0, 0),
                cross_out=True,
            )
        page.apply_redactions()

    doc.save("highlighted_file.pdf", garbage=3, deflate=True)
finally:
    # Release the document even if searching, redacting or saving fails.
    doc.close()
Configuration
OS ubuntu
Python 3.8
PyMuPDF 1.19.6