-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathCodeConverter.py
More file actions
61 lines (54 loc) · 1.92 KB
/
CodeConverter.py
File metadata and controls
61 lines (54 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import docx
import pytesseract
from PIL import Image
import aspose.words as aw
import os
import pandas as pd
import PyPDF2
class FileConverter:
    """Extract plain text from a document, spreadsheet, PDF, or image file.

    The file type is chosen from the extension of ``path`` (matched
    case-insensitively). Supported extensions: ``.docx``, ``.txt``, ``.pdf``,
    ``.csv``, ``.xlsx`` and the image suffixes listed in ``convert_tostr``.
    """

    # Class-level defaults; __init__ overrides both on every instance.
    path = ""
    tesseractFilePath = ""

    def __init__(self, p, t):
        """Store the input file path and the Tesseract executable path.

        Args:
            p: Path of the file to convert.
            t: Path of the Tesseract executable, used only for image input.
                When empty, pytesseract's own default command is left as is.
        """
        self.path = p
        self.tesseractFilePath = t

    def convert_tostr(self):
        """Return the textual content of ``self.path`` as a single string.

        Returns:
            str: The extracted text. For ``.csv``/``.xlsx`` this is the
            ``DataFrame.to_string()`` rendering; for images it is the OCR
            output of Tesseract; for ``.txt`` it is the whole file content.

        Raises:
            ValueError: If the file extension is not one of the supported
                types.
        """
        imgSuffixes = (".jpg", ".png", ".gif", ".jpeg", ".raw", ".cr2", ".nef", ".orf", ".sr2")
        # Match on a lower-cased copy so "REPORT.PDF" or "scan.JPG" is
        # recognised; the original path is still used to open the file.
        lowered = self.path.lower()

        if lowered.endswith(".docx"):
            doc = docx.Document(self.path)
            return "\n".join(para.text for para in doc.paragraphs)

        if lowered.endswith(".txt"):
            # read() (not readlines()) so this branch returns a str like
            # every other branch does.
            with open(self.path, encoding="utf-8") as f:
                return f.read()

        if lowered.endswith(".pdf"):
            # The context manager closes the handle even if parsing fails.
            with open(self.path, "rb") as pdf_file:
                # PdfReader/.pages is the API that ships alongside
                # extract_text(); PdfFileReader/numPages/getPage were removed
                # in PyPDF2 3.0.
                reader = PyPDF2.PdfReader(pdf_file, strict=False)
                return "".join((page.extract_text() or "") for page in reader.pages)

        if lowered.endswith(".csv"):
            return pd.read_csv(self.path).to_string()

        if lowered.endswith(".xlsx"):
            return pd.read_excel(self.path).to_string()

        if lowered.endswith(imgSuffixes):
            # Only override pytesseract's command when a path was supplied;
            # an empty string would replace the working default with "".
            if self.tesseractFilePath:
                pytesseract.pytesseract.tesseract_cmd = self.tesseractFilePath
            with Image.open(self.path) as img:
                return pytesseract.image_to_string(img)

        # Fail with a clear message instead of an UnboundLocalError on `text`.
        raise ValueError(f"Unsupported file type: {self.path!r}")
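# Example usage (the second argument is the path to the Tesseract executable,
# which is only needed for image input):
# if __name__ == "__main__":
#     f = FileConverter('C:\\Users\\adity\\Downloads\\itotext.png', 'C:\\path\\to\\tesseract.exe')
#     print(f.convert_tostr())
#     # FileConverter('C:\\Backup Project\\1.pdf', '').convert_tostr()
#     # FileConverter('C:\\Users\\adity\\Downloads\\itotext.png', 'C:\\path\\to\\tesseract.exe').convert_tostr()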