From 1285778bf973ccbf336217a36a8193e18b288dcc Mon Sep 17 00:00:00 2001 From: EamesLionel <53807755+Lorre-Ramon@users.noreply.github.com> Date: Wed, 6 Nov 2024 17:25:43 +0800 Subject: [PATCH] Dev/reorganize (#5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 将OpenPDF的过程单独创建为一个类 * 完善上下文管理器的进入退出在图像文件处理方面 * 添加open_PDF的init * 移动OpenPDF类进入一个新的文件,划分文件内的类作用 * 优化 * 完成图像反色检测类的编写,config defaults to False * 完善图片反色检测运行逻辑,基本完成图像提取代码 * 添加对手动创建data 和 log文件夹的注释 * 优化图片创建文件夹的逻辑 * 初步构建PDF图像提取在main中的逻辑 * 修改OpenPDF类为dataclass类 * 完成PDF图像提取的测试 * 添加计时装饰器;添加将特殊INFO信息提交至控制台的类,未实现 * 优化日志提示 * 放弃在logger中添加强制在console中输出特定INFO的做法 * 添加等待动画;发现使用守护线程不能在主线程结束之前结束输出 * 完善装饰器 * 完善docstring * 新建文本提取文件 * 添加文本提取的测试文件 * 明确了.get_text()为一个可用alias * finish text extract v1, waiting for debugging; Found document closed error * text_extract_debug1: finish closed Document problem * text_extract_debug: complete --- configs/global_configs.json | 4 +- main.py | 31 ++++- modules/PDF_text_extract.py | 127 +++++++++++++----- .../PDF_text_extract.cpython-310.pyc | Bin 2697 -> 4550 bytes modules/__pycache__/open_PDF.cpython-310.pyc | Bin 3560 -> 3878 bytes modules/open_PDF.py | 11 +- utils/MyDecorator.py | 2 +- utils/__pycache__/MyDecorator.cpython-310.pyc | Bin 1687 -> 1685 bytes 8 files changed, 136 insertions(+), 39 deletions(-) diff --git a/configs/global_configs.json b/configs/global_configs.json index f420a41..98ed1d6 100644 --- a/configs/global_configs.json +++ b/configs/global_configs.json @@ -2,7 +2,9 @@ "test_set": { "detect_inverted_img": false, "img_height_threshold": 100, - "successful_img_cnt_buffer": 5 + "successful_img_cnt_buffer": 5, + "textblock_y_threshold": 20, + "header_footer_threshold": 100 } } \ No newline at end of file diff --git a/main.py b/main.py index e78e5ea..0a9de4f 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -from modules import OpenPDF, PDFImgExtract +from modules import OpenPDF from utils import logger, getRunTime import os @@ -8,10 +8,11 @@ def main(pdf_path:str) -> None: os.makedirs("output", exist_ok=True) with OpenPDF(pdf_path, "test_set") as pdf: - extract_pdf_images(pdf) + extract_images(pdf) + extract_text(pdf) @getRunTime("提取PDF文件图片") -def extract_pdf_images(pdf:OpenPDF) -> None: +def extract_images(pdf:OpenPDF) -> None: """提取PDF文件中的图片 Args: @@ -38,8 +39,30 @@ def extract_pdf_images(pdf:OpenPDF) -> None: raise e finally: logger.info(f"pdf: {pdf.pdf_filename}完成提取图片") + +@getRunTime("提取PDF文件文本") +def extract_text(pdf:OpenPDF) -> None: + + text_coords_df_filepath = os.path.join("output", "text_coords.xlsx") + if os.path.exists(text_coords_df_filepath): + text_coords_df = pd.read_excel(text_coords_df_filepath) + else: + text_coords_df = pd.DataFrame(columns=["PDF_name", "page", "p_index", + "content", "center_x", "center_y"]) - + try: + pdf_text_extract = pdf.getPDFTextExtract() + logger.info(f"pdf: {pdf.pdf_filename}开始提取文本") + text_coords_df_temp = pdf_text_extract.main() + text_coords_df = pd.concat([text_coords_df, text_coords_df_temp], ignore_index=True) + text_coords_df.to_excel(text_coords_df_filepath, index=False) + except Exception as e: + logger.error(f"Error: pdf: {pdf.pdf_filename}\n\t{e}") + raise e + finally: + logger.info(f"pdf: {pdf.pdf_filename}完成提取文本") + + if __name__ == "__main__": pdf_path = "data/SUS/2022/00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pdf" diff --git a/modules/PDF_text_extract.py b/modules/PDF_text_extract.py index 1f0be14..5ed1805 100644 --- a/modules/PDF_text_extract.py +++ b/modules/PDF_text_extract.py @@ -1,32 +1,11 @@ from utils import logger from modules import OpenPDF -import os import pandas as pd -import pymupdf -import pdb -from dataclasses import dataclass from typing import Tuple, List, Any import warnings warnings.filterwarnings("ignore") - -@dataclass -class PDFTextBlock: - pdf_filename: str - pdf_page: int - pdf_page_height: float - text_index: int - # img_info: Tuple[int,int,int,int,int,str,str,str,str,int] - # img_bytes: bytes = None - # img_xref: str = None - # base_img: dict = None - # img_lazy_open: Image.Image = None - x0: int = None - y0: int = None - x1: int = None - y1: int = None - center_coord: Tuple[int,int] = None class PDFTextExtract: def __init__(self, pdf_instance: OpenPDF) -> None: @@ -35,15 +14,67 @@ def __init__(self, pdf_instance: OpenPDF) -> None: Args: pdf_instance (OpenPDF): 打开的PDF实例 """ - self.pdf = pdf_instance + self.pdf = pdf_instance + + self.textblock_y_threshold = self.pdf.global_config['textblock_y_threshold'] + self.header_footer_threshold = self.pdf.global_config['header_footer_threshold'] - def __post_init__(self) -> None: - """__init__函数的后处理函数""" - pass - - def main(self) -> None: - """主函数""" - pass + def main(self) -> pd.DataFrame: + """主函数 + + Returns: + pd.DataFrame: 提取的PDF文本信息 + """ + paragraphs:List[str] = [] + coordinates:List[Tuple[float, float]] = [] + text_df:pd.DataFrame = pd.DataFrame(columns=["PDF_name", "page", "p_index", "content", "center_x", "center_y"]) + + for page_num in range(self.pdf.pdf_page_count): + blocks = self.extractTextListInfo(page_num) + if blocks == []: + continue + + page_height = self.pdf.pdf[page_num].rect.height + + init_text = "" + x0_init, y0_init, x1_init, y1_init = blocks[0][:4] + + for block in blocks: + if self.extractTextInfo(block) is None: + continue + else: + x0, y0, x1, y1, text = self.extractTextInfo(block) + + y_close_enough = abs(y1 - y1_init) < self.textblock_y_threshold + # print(y_close_enough) + if y_close_enough: + init_text += text.strip() + x1_init = max(x1, x1_init) + y1_init = max(y1, y1_init) + else: + paragraphs.append(init_text.replace(" ", "")) + coordinates.append(((x0_init + x1_init) / 2, (y0_init + y1_init) / 2)) + + init_text = text.strip() + x0_init, y0_init, x1_init, y1_init = x0, y0, x1, y1 + + # save the last paragraph + paragraphs.append(init_text.replace(" ", "")) + coordinates.append(((x0_init + x1_init) / 2, (y0_init + y1_init) / 2)) + + # save the extracted text to a dataframe + index = 0 + for para, coord in zip(paragraphs, coordinates): + center_x, center_y = coord + if not self.isHeaderOrFooter(center_y, page_height): + info = pd.Series(["", page_num+1, index, para, center_x, center_y], + index=["PDF_name", "page", "p_index", + "content", "center_x", "center_y"]) + text_df = pd.concat([text_df, pd.DataFrame([info])], ignore_index=True) + + index += 1 + + return text_df def extractTextListInfo(self, page_num: int) -> List[Tuple[float, float, float, float, str, Any, Any]]: """提取PDF中某页的全部文本段信息 @@ -74,10 +105,42 @@ def extractTextListInfo(self, page_num: int) -> List[Tuple[float, float, float, raise e return [] - def extractTextInfo(self, block:List): - + def extractTextInfo(self, block:List) -> List[Tuple[float,float,float,float,str]]: + """提取PDF单个文本段的信息 + + Args: + block (List) + + Returns: + List[float,float,float,float,str]: 具体地址信息和文本内容 + List[0]: block_x0: 文本段左上角x坐标 + List[1]: block_y0: 文本段左上角y坐标 + List[2]: block_x1: 文本段右下角x坐标 + List[3]: block_y1: 文本段右下角y坐标 + List[4]: block_text: 文本段内容 + """ match block: case _ if " bool: + """判断文本段是否为页眉或页脚 + + Args: + center_y (float): 文本段中心y坐标 + page_height (float): PDF页高 + + Returns: + bool: 是否为页眉或页脚 + """ + match center_y: + case _ if center_y < self.header_footer_threshold: + return True + case _ if center_y > (page_height - self.header_footer_threshold): + return True + case _: + return False \ No newline at end of file diff --git a/modules/__pycache__/PDF_text_extract.cpython-310.pyc b/modules/__pycache__/PDF_text_extract.cpython-310.pyc index b520a2c233f3c90edae90e535d82ffc02655a58c..3d865c69090fa2beb1234604d1fdc06a5c524257 100644 GIT binary patch literal 4550 zcmbtX>u(#!5#PNh-=Zm7kEFHPpe@9>AaWm|ZLnwVk)lN2(R;_3 z6z&Q)vSX*VVIzUnAhBAh$g6E)Cuq~CaUA!1|AAsr^hYf^cg9iAo4iMMOI*Rl*3G` z+U2rIfz$WZP3whSd&Y7yQ};LORg)>-uQ)D~pRrm@tCTI9nnkX*_DeuzClu#A5EJDw zL3xQ_QzjCTOA1kldR)mXL^9P$tt1IUl)O$NW(3-zX3UJYe+e^bCQJGt8bqKMB{9%T zk+>OMFIQ6AG6|9dnI4id_QWTs0RNs~>`x=AcE!&0!K{btkM-E^r@bXlTKN`;E$ zxQ10UnMO^wL9MZ(1lD&x&hzpt6zA(eJnW$Y^3Wuj!~_olSCHB@Nz`O9nbN3$aByla zc<*#@X7<*JuYY_c7R$E3XK2|O?bdkXTC1`{?UTrjW|v=mJ^1|C^*4{hXu+jlE`4#j ztH!ptu&%>ZyEs*772FAGIumx4u>J|tASNx8Y}*r*nYwpFT> zX&j8E{1|f*QyjBeqEV>)2GKP9-L!^(;F#1IuGH$(eyQS^!@Epp%C+mmdm9EdEY~zi zb}uzZ!*ES!c>jdyRGi`6<9m52RBGh{HS4xhac$b-P0!cL!!?^Usvrz&D)2QFpzy=l zUT?8T0Zyml77E-fM+2g1DJg@e`)}RP?H#!y2AR$`2n@Gg9NnLxr;L0IMRMWj$^fQc zVSPbE+% zD$!2B(H~U&h#&Q1e%w=GoFDoL7$@Q-7BEp8i0(Y{c&C0)^^<u$p9@nSQQ9|I0#fs4afcs}Xr^C_Yec|qd9_9SSD zc(Hy&A`c=j>cvB@6!wboz~-cvg7MNNW!2vQe>9@_y>1UcL7I_Zts@5|f3v^E+XDZ6 zem|@;?e%-wg5>-a_8QRV0fzgL*XwQe`n>+K*53WtS;-#&iHs+^XdZh5WxRmP5{Z)p z^vGT>iNX%_9{B+FpF65#qX6f*w0I>rd3E{1N8Q`sWnL5rz70;>cNwm+hZ;2#Ky>bW zaBemP(emuc<%=IIU0YcG+2!sSIgLg5Wwbz?DRrZ4GQAG>keE%T158}ga#^GZ6adxi zaxLyPi*~h9vmAyfzldWi3ecN4yTU!l3U2}-RSdNsxJ2ki!?AO{OsW$W?V8F|Koof6 zB&e|z_X*q<0BQr~?`@-mdo%8rpSMajQ@|y=Oq(z((n_7l zHKWNiqYkc~Fdd*-HNXwZ)6kW!IL`^sv5)Q%u7hdgCastb(_jmVhMOCtJz!WqRtMmh zsZpPBSiET4lvFGZ{=6>03?3#T%(faerV0N9a-tAGSTIhgiuE3TF zw-%C3qOoRFEC|HNc@Bu8DL9SOQVd711pkUOfFn{0_{~xVY6%>}37nC(LTew+$Xl@@ zXFxWA`>=+!|2!baBLojU%A5%KypQ`c@Da+(;EjCcr^-=$ z6y>zW3ve?-M1YA)iytok>io*WCxDLN#QQ5V?}q?czVt~OBVCYe5VZK^gj3EARV){R z6^sYNgyX@G9IjpM7Y&FV>re{Ne8#eCMzxhSn?w3MgQThesjm#`Op1+Y4P+u%>PJNzqO(M-R6I^ ztIvV7)`C|~1egAF$8|q83W~vrKQ3LF?aI(c;qKm4!DEePZ|YeN_R(yEqiS_y!#fwe z{qc=Iyfc>5LMIm~o36+KSaf&i3Q+DknQBsM(=cBvRjMZU%{f(gAb}&g8KlI#44dmf zO}_?x`=B^nNXh_eB@L(WmjJyAjvSNzJ^j!+e6I1F9p_3P2jyGfszXCJy&zEWFy!0V zgQQ{*y@cpC9?}4aXi}P#IT)b_l2zcePzE@CPUo;Bk#QJ`_mikdB~%~6pRf3;uX&OO z;nPz+t)xN-j5Yoaaij~`;OvEU*ZuBD756+l#8;NP2TUC(Rc*ul*6kF6h&cRDm#)4Z zT>L1QI~L9zymlr8Q1CzQ&V2#yzR-4GP`fXj-7CnZ^`|N|jvD$PTr2uLPIhv_Gl*$D z8$z_fJC_ze3*P!7m^s(p`ng~H^Yb&owX4@Jyfe1K14>R22%_KR`aG4OyLhV}VuxR9 z`UEH6=Hy9EMmhNoCp_n04UTjOs#PdX3W$_S;cozbB#3n>JiYbK2;#Yi2)Xnz&@0M7 zIv?kG5fN_$7!NB5q7=`C(G;R_4yf_4MZ~rNG^Nljye$e3V1R?t_#3DNr`}$k{jlq` zm*2P?oCnAK84qO_UtK;m3(Sq0#Th!Yp^PwG(0X!y6ojPck_?G@ibQy=3aF@xkV-GUSkBJu)~eTDjlI$) zsuq*2XPGxz(!J0{}?*=X< zz^$a-_X4l)2Y$a1G^#x#X$DQ%{Hi7FuPG(ceUAu7@BzbX zlB{z_djru8yKy4Zs4q1~ZxHol&^%`0Z5j7&7lB*AJ7Js(Ioj52bV;*aSfk5eFKe$O z(?Y7SlVwV@{qnQc`gLvOh0@J1jM6kKq9V@HJPfrFr-inSWe#q~C~K*SSFYqGtSqmI8bD%3dY z6tDd+vV&@luUU0!4R+gXh?W2I$Dd~uVB-2RO=Csp8T-rB5;>7)}r&Rr$~uLYybcN diff --git a/modules/__pycache__/open_PDF.cpython-310.pyc b/modules/__pycache__/open_PDF.cpython-310.pyc index 3dbbb471ea3d1ce004c72ad9b6d2612e0633ab4c..feb308b6453cd7c6e61f8d08b4249af5c9a08811 100644 GIT binary patch delta 413 zcmaDMy-bcTpO=@50SM0g(Mn&yJCScWW7WiMa`F->tSM|QEK#f}5-IE{96%OZiew6B z3Kx*Yo+2^vv!Vsc3`Bgjf9U;@%0>FjyW`X@Z?pZ088 z`?EP~CO_deltePdN&!Vymq(%A4ye&jlj)WK(8Bl>_>; z3rq+AS+@k!Q%g`?nFLZ`3nZ!}{R>j_fRcI`YJeuP162U61k;nx@(A%lm=FtjCqLkc xWU~R8 delta 140 zcmZ1`_d=R4pO=@50SLIysHAuDOypb6STJ#$oV;iXYYJNnOB8F0XbO7@2av^&az&iV`9~Yq%IVm^c`@n7BZIpN~yQh)-bhM!r%2pz|O9 diff --git a/modules/open_PDF.py b/modules/open_PDF.py index 7d91a43..eb110ef 100644 --- a/modules/open_PDF.py +++ b/modules/open_PDF.py @@ -93,4 +93,13 @@ def getPDFImgExtract(self) -> "PDFImgExtract": PDFImgExtract: 提取图片的子类实例 """ from .PDF_img_extract import PDFImgExtract - return PDFImgExtract(self) \ No newline at end of file + return PDFImgExtract(self) + + def getPDFTextExtract(self) -> "PDFTextExtract": + """返回提取文本的子类实例 + + Returns: + PDFTextExtract: 提取文本的子类实例 + """ + from .PDF_text_extract import PDFTextExtract + return PDFTextExtract(self) \ No newline at end of file diff --git a/utils/MyDecorator.py b/utils/MyDecorator.py index 5f20089..9494a9c 100644 --- a/utils/MyDecorator.py +++ b/utils/MyDecorator.py @@ -22,7 +22,7 @@ def wrapper(*args, **kwargs): # start pending_animation daemon thread stop_event = threading.Event() - loader_thread = threading.Thread(target=pending_animation, args=(stop_event, f"运行{function_name}中",)) + loader_thread = threading.Thread(target=pending_animation, args=(stop_event, f"{function_name}中",)) loader_thread.daemon = True # 设置为守护线程 loader_thread.start() diff --git a/utils/__pycache__/MyDecorator.cpython-310.pyc b/utils/__pycache__/MyDecorator.cpython-310.pyc index 030010fbf06a7caf90af144326726309c29e35f8..8771a7a232c0bd26fe1d01a0e2bfedfa496146ef 100644 GIT binary patch delta 55 zcmbQvJC&C=pO=@50SI3I)=HP#$UB{hQEu{5CRL6O#@P%h%yXG0UtzLjblJ?utii-6 KvDut8fe`?WPYuBU delta 57 zcmbQrJDry|pO=@50SIm{P)%3a$UB{hQDO2@CRMH!#tz2W3@OZWnI~UjvSf7K%)_j~ M#3;4dlr@170F{Rgz5oCK