-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_api.py
More file actions
177 lines (147 loc) · 6.39 KB
/
pdf_api.py
File metadata and controls
177 lines (147 loc) · 6.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import os
from dotenv import load_dotenv
#for file operation
import shutil
from typing import Optional
from datetime import datetime, UTC
#fastAPI
from fastapi import Depends, FastAPI, FastAPI, HTTPException, status, File, UploadFile
from fastapi.security import HTTPBasic, HTTPBasicCredentials
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
#datamodel
from sqlalchemy.orm import Session
from sqlalchemy import func
from model import SessionLocal, pdfs, changeLogs, init_db
# Module bootstrap: everything below runs once, at import time, in this order.
load_dotenv()  # load variables from a .env file so the os.getenv calls below can see them
# Directory used for temporary PDF copies; defaults to "<cwd>/data".
# This statement only computes the path; it does not create the directory.
DATA_DIR = os.getenv("DATA_DIR", f"{os.getcwd()}/data")
init_db()  # Ensure DB tables exist on startup
app = FastAPI()
# NOTE(review): FastAPI's CORS documentation says wildcard origins/methods/headers
# must not be combined with allow_credentials=True. Before deploying outside of
# development, replace the wildcards with an explicit origin list.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # For dev only: accepts every origin (e.g. a frontend on localhost:3000)
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
    expose_headers=["X-PDF-ID", "X-Temp-Count"]  # Crucial: lets browser JavaScript read these custom response headers
)
# HTTP Basic credential extractor, consumed by the authenticate() dependency.
security = HTTPBasic()
####### Dependency: DB #######
def get_db():
    """Yield one database session and close it when the consumer is finished.

    Written as a generator so it can be used with ``Depends``: the code after
    ``yield`` runs once the consumer is done with the session, including when
    an exception was raised while it was in use.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        # Always release the session, whether the consumer succeeded or not.
        session.close()
####### Dependency: Security #######
def authenticate(credentials: HTTPBasicCredentials = Depends(security)):
    """Validate HTTP Basic credentials against the USR / PASSWORD environment variables.

    Args:
        credentials: Username/password pair extracted by the ``security`` scheme.

    Returns:
        The authenticated username.

    Raises:
        HTTPException: 401 when the credentials do not match, or when the
            server has no USR / PASSWORD configured (fail closed).
    """
    # Function-scope import so this block does not depend on the file's import list.
    import secrets

    expected_user = os.getenv("USR")
    expected_password = os.getenv("PASSWORD")

    # With no configured credentials nobody can log in (same outcome as before,
    # where comparing against None never matched).
    if expected_user is None or expected_password is None:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Unauthorized")

    # compare_digest runs in time independent of where the inputs differ, so the
    # response time does not leak how much of a guess was correct. Encoding to
    # bytes allows non-ASCII credentials. Both checks are always evaluated (no
    # short-circuit) so timing does not reveal which field was wrong.
    user_ok = secrets.compare_digest(
        credentials.username.encode("utf-8"), expected_user.encode("utf-8")
    )
    password_ok = secrets.compare_digest(
        credentials.password.encode("utf-8"), expected_password.encode("utf-8")
    )
    if not (user_ok and password_ok):
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Unauthorized")
    return credentials.username
####### Dependency: Pydantic Models #######
class PDFStatus(BaseModel):
    # Response body returned by the PUT /pdf/{id} route.
    # (Kept as comments rather than a docstring so the generated schema is unchanged.)

    # Identifier of the PDF record.
    pdf_id: int
    # Path of the PDF file as stored on the record.
    file_path: str
    # Path of the most recent temporary copy; optional, defaults to None.
    last_temp_path: Optional[str] = None
    # Number of change-log rows currently stored for this PDF.
    temp_count: int
####### MAIN: Routes #######
@app.get("/pdf/{file_path:path}", response_class=FileResponse)
async def get_pdf(file_path: str,
                  db: Session = Depends(get_db),
                  user: str = Depends(authenticate)):
    """
    Retrieves a PDF file from the local filesystem for inline viewing.

    This endpoint:
    1. Validates that the path points to an existing regular file.
    2. Registers the PDF in the database if it is being accessed for the first time.
    3. Returns the file with the custom headers 'X-PDF-ID' and 'X-Temp-Count'
       (plus 'Accept-Ranges') that the frontend PDF viewer reads.

    Args:
        file_path: Server-side path of the PDF, taken from the URL.
        db: Database session (injected).
        user: Authenticated username (injected; only used to enforce auth).

    Returns:
        A FileResponse with media type application/pdf and inline disposition.

    Raises:
        HTTPException: 404 when the path is missing or is not a regular file.
    """
    # NOTE(review): file_path comes straight from the URL and is not confined to
    # any base directory, so an authenticated caller can request any file the
    # server process can read. Consider restricting it to a configured root.

    # isfile (not exists): a directory path would otherwise pass this check and
    # only fail later inside FileResponse as a 500.
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404,
                            detail="file not found")

    # Look up the record for this path; create it on first access so the
    # client always receives an id it can use for later saves.
    pdf_record = db.query(pdfs).filter(pdfs.path == file_path).first()
    if not pdf_record:
        pdf_record = pdfs(path=file_path)
        db.add(pdf_record)
        db.commit()
        db.refresh(pdf_record)

    # Number of change-log rows stored for this PDF.
    count = db.query(func.count(changeLogs.id)).filter(changeLogs.pdf_id == pdf_record.id).scalar()

    # NOTE(review): whether FileResponse itself honours Range requests depends on
    # the installed Starlette version; confirm if the viewer relies on it.
    return FileResponse(
        file_path,
        media_type="application/pdf",
        # Only the final path component belongs in Content-Disposition; the full
        # path would embed directory separators in the suggested filename.
        filename=os.path.basename(file_path),
        # 'inline' asks the client to display the file rather than download it
        content_disposition_type="inline",
        headers={
            "X-PDF-ID": str(pdf_record.id),
            "X-Temp-Count": str(count),
            "Access-Control-Expose-Headers": "X-PDF-ID, X-Temp-Count",
            "Accept-Ranges": "bytes"
        }
    )
@app.put("/pdf/{id}", response_model=PDFStatus, status_code=status.HTTP_200_OK)
async def save_pdf(id: int,
                   file: UploadFile = File(...),  # incoming upload stream
                   db: Session = Depends(get_db),
                   user: str = Depends(authenticate)):
    """
    Accepts a modified PDF stream, saves it as a temp file, and overwrites the original.

    This endpoint:
    1. Receives an upload stream and copies it to a temp file in DATA_DIR in chunks.
    2. Overwrites the original server file with the new content.
    3. Logs the modification in the database together with the temp file path.
    4. Enforces a retention policy (MAX_RETAIN env var, default 5): older temp
       files and their log rows are deleted once the limit is exceeded.

    Args:
        id: Database id of the PDF record (name fixed by the route's path parameter).
        file: Uploaded PDF content.
        db: Database session (injected).
        user: Authenticated username (injected; only used to enforce auth).

    Returns:
        A dict matching PDFStatus. 'last_temp_path' is None when MAX_RETAIN is 0,
        because the temp copy just written has already been pruned.

    Raises:
        HTTPException: 404 when the id is unknown; 500 when any file or
            database step fails.
    """
    pdf_record = db.query(pdfs).filter(pdfs.id == id).first()
    if not pdf_record:
        raise HTTPException(status_code=404, detail="PDF ID not found in database")

    # Microsecond resolution: two saves of the same PDF within one second must
    # not share a temp file, otherwise two log rows would point at one file and
    # pruning one of them would delete the other's copy.
    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S_%f")
    temp_filename = f"tmp_{id}_{timestamp}.pdf"
    temp_path = os.path.join(DATA_DIR, temp_filename)

    log_committed = False
    # NOTE(review): the file and DB calls below are blocking inside an async
    # handler; consider a plain `def` route if request latency matters.
    try:
        # The data directory may not exist yet on a fresh deployment.
        os.makedirs(DATA_DIR, exist_ok=True)

        # copyfileobj reads the upload in chunks and writes each one out, so the
        # whole file is never held in memory at once.
        with open(temp_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        shutil.copy2(temp_path, pdf_record.path)  # overwrite the original with the temp copy

        new_log = changeLogs(pdf_id=pdf_record.id, temp_path=temp_path)
        db.add(new_log)
        db.commit()
        log_committed = True

        # Retention policy: keep only the newest max_retain temp copies.
        # Clamp at 0 so a negative setting cannot invert the slice below.
        max_retain = max(int(os.getenv("MAX_RETAIN", 5)), 0)
        all_logs = db.query(changeLogs).filter(changeLogs.pdf_id == id).order_by(changeLogs.id.asc()).all()
        # Count the surplus explicitly: all_logs[:-0] would be an empty slice,
        # so a limit of 0 would otherwise never prune anything.
        excess = len(all_logs) - max_retain
        if excess > 0:
            for log in all_logs[:excess]:  # oldest first
                try:
                    os.remove(log.temp_path)
                except FileNotFoundError:
                    # Already gone; nothing left to clean up on disk.
                    pass
                db.delete(log)
            db.commit()

        final_count = db.query(func.count(changeLogs.id)).filter(changeLogs.pdf_id == id).scalar()
        return {
            "pdf_id": pdf_record.id,
            "file_path": pdf_record.path,
            # With a limit of 0 the copy just written was pruned above.
            "last_temp_path": temp_path if max_retain > 0 else None,
            "temp_count": final_count
        }
    except Exception as e:
        db.rollback()
        if not log_committed:
            # No log row references this temp file, so the retention policy
            # would never reclaim it; remove it now (best effort).
            try:
                os.remove(temp_path)
            except OSError:
                pass
        raise HTTPException(status_code=500, detail=f"File system error: {str(e)}") from e
    finally:
        # Release the upload's underlying file on success and on failure.
        file.file.close()