-
Notifications
You must be signed in to change notification settings - Fork 678
Description
Hello
I am using PyMuPDF (1.19.6) to search for a string in a PDF file and redact the matches. However, the redaction also removes text that is located just above the matched result.
Please help me with this strange issue.
I am also attaching the original file and converted file.
DRAFT_Executive.pdf
highlighted_file.pdf
Steps to reproduce
import re
import fitz
import sys, json
# Collect the page rectangles of every tag (e.g. "[s:a:r]") found in the PDF.
#
# Results go into ``tagsPerPage`` (0-based page number -> list of entries) and
# ``resultOutput`` (flat list of the same entries). Each entry keeps x1/x2 as
# returned by ``page.search_for`` and stores y1/y2 flipped to a bottom-up axis
# (page height minus the PyMuPDF value); 'page' is the 1-based page number.
file_path = "DRAFT_Executive.pdf"

# The literal "[" and "]" delimiters must be escaped; unescaped they open a
# character class and the pattern fails to compile. Inside a character class
# "|" is a literal, so alternatives are listed plainly: [scdit], not
# [s|c|d|i|t].
pattern = r'\[\s*([scdit]):([a-z]):([or])\s*\]'  # Replace with your desired regex pattern
# Compiled once; MULTILINE/DOTALL were dropped because the pattern contains
# no "^", "$" or "." for them to affect.
tag_re = re.compile(pattern, re.IGNORECASE)

doc = fitz.open(file_path)
resultOutput = []
tagsPerPage = {}
addedTags = set()  # coordinate keys already recorded, used for de-duplication


def _coords_key(entry, page_number):
    """Return the de-duplication key for an entry's rectangle on a page."""
    return f"{entry['y2']}_{entry['y1']}_{entry['x2']}_{entry['x1']}_{page_number}"


def _record(entry, key, page_number):
    """Store *entry* in both result containers unless *key* was seen before."""
    if key in addedTags:
        return
    addedTags.add(key)
    tagsPerPage[page_number].append(entry)
    resultOutput.append(entry)


for page in doc:
    page_height = page.rect.height
    tagsPerPage[page.number] = []

    for match in tag_re.finditer(page.get_text()):
        tag = match.group()
        pending = {}            # partial entry while a hit is split over several rects
        needle_started = False  # True once a fragment containing "[" has been seen

        # search_for returns one rectangle per hit fragment on this page.
        for rect in page.search_for(tag):
            x1, top, x2, bottom = rect
            y1 = page_height - bottom
            y2 = page_height - top

            # Text under the rectangle with all whitespace removed.
            clip_text = ''.join(page.get_text("text", clip=rect).split())

            if tag_re.match(clip_text):
                # The rectangle covers a complete tag by itself.
                entry = {'y2': y2, 'y1': y1, 'tag': tag,
                         'x2': x2, 'x1': x1, 'page': page.number + 1}
                _record(entry, _coords_key(entry, page.number), page.number)
                continue

            if not needle_started and '[' in clip_text:
                # Opening fragment: start a fresh partial entry.
                needle_started = True
                pending = {'y2': y2, 'y1': y1, 'tag': tag,
                           'x2': x2, 'x1': x1, 'page': page.number + 1}
                continue

            # Middle or closing fragment: widen the horizontal extent to cover
            # the fragments seen so far. The vertical extent is taken from the
            # current (latest) fragment only.
            # NOTE(review): for a tag broken across lines this yields a box on
            # the last line only - confirm that is intended.
            merged = {'y2': y2, 'y1': y1, 'tag': tag,
                      'x2': max(x2, pending.get('x2', x2)),
                      'x1': min(x1, pending.get('x1', x1)),
                      'page': page.number + 1}

            if needle_started and ']' in clip_text:
                # Closing fragment: the key is built from the partial entry as
                # it stood before this fragment was merged in.
                needle_started = False
                _record(merged, _coords_key(pending, page.number), page.number)
                pending = {}
            else:
                pending = merged
# Redact every collected tag rectangle and write the result to a new PDF.
#
# Entries in ``tagsPerPage`` store y1/y2 on a bottom-up axis (page height
# minus the PyMuPDF value), so they are flipped back before use.
#
# Why the rectangle is shrunk: the hit rectangles cover the full line height,
# which can slightly overlap the text line above or below. Redaction removes
# text that overlaps the redaction rectangle, so an un-shrunk rectangle can
# also wipe out neighbouring lines (the symptom reported here). Trimming the
# top and bottom keeps the rectangle on the tag's own line while still
# intersecting the tag's characters.
# NOTE(review): removal-by-overlap is per the PyMuPDF redaction docs - confirm
# for the installed version, and tune the fraction for the document's fonts.
VERTICAL_SHRINK = 0.2  # fraction of the hit height trimmed from top and bottom

for page in doc:
    items = tagsPerPage.get(page.number, [])
    if not items:
        continue

    page_height = page.rect.height
    for item in items:
        # Entries carry a 1-based page number; skip anything not for this page.
        if item['page'] != page.number + 1:
            continue

        top = page_height - item['y2']
        bottom = page_height - item['y1']
        inset = (bottom - top) * VERTICAL_SHRINK
        page.add_redact_annot(
            fitz.Rect(item['x1'], top + inset, item['x2'], bottom - inset),
            text_color=(0, 0, 0),
            cross_out=True,
        )

    # Apply once per page, after all of its redaction annotations exist.
    page.apply_redactions()

doc.save("highlighted_file.pdf", garbage=3, deflate=True)
doc.close()
Configuration
OS ubuntu
Python 3.8
PyMuPDF 1.19.6