Skip to content

Commit dd174d8

Browse files
migrated from JSON db to SQLite; started setting up things to add lobsters support
1 parent 720edc1 commit dd174d8

4 files changed

Lines changed: 64 additions & 46 deletions

File tree

requirements.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,8 @@ beautifulsoup4==4.12.3
33
feedgen==1.0.0
44
Flask==3.1.0
55
Flask_APScheduler==1.13.1
6-
jsonpickle==3.2.2
76
pyfunctional==1.5.0
8-
pyquery==2.0.0
97
Requests==2.32.3
108
stopwatch.py==2.0.1
11-
waitress==3.0.2
9+
waitress==3.0.2
10+
peewee==3.18.3

src/app.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
import json
22
import os
33
import sys
4+
from enum import Enum
45
from pathlib import Path
5-
66
from flask import Flask, render_template_string, Response
77
from flask_apscheduler import APScheduler
8-
98
from hackernewsd import HackerNewsScraper
109
from waitress import serve
1110
import logging
1211
from logging import handlers
1312
from logging.handlers import RotatingFileHandler
13+
from peewee import SqliteDatabase
14+
from models import BaseModel, Story
1415

1516
app = Flask(__name__)
1617
scheduler = APScheduler()
@@ -73,11 +74,18 @@ def initLogger():
7374
rootLogger.addHandler(fileHandler)
7475
rootLogger.addHandler(consoleHandler)
7576

77+
def initDb():
    """Open (or create) the SQLite database and make sure all tables exist."""
    # models.py declares a DatabaseProxy as the models' database;
    # bind that proxy to the real SQLite file here, at startup.
    database = SqliteDatabase(Path.home() / "hnd.db")
    BaseModel._meta.database.initialize(database)
    database.connect()
    # create_tables() is a no-op for tables that already exist.
    database.create_tables([Story])
82+
7683

7784
if __name__ == '__main__':
    # Boot order matters: logging first, then the database, then the scheduler.
    initLogger()
    initDb()
    config = json.loads(readRcFile())
    scheduler.init_app(app)
    scheduler.start()
    # Serve through waitress (production WSGI server); the commented Flask
    # dev-server call is kept for local debugging.
    serve(app, host=config["host"], port=config["port"])
    # app.run(port=5555)

src/hackernewsd.py

Lines changed: 31 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,22 @@
1-
import base64
21
import json
32
import logging
43
import re
5-
import os.path
6-
import tempfile
7-
from datetime import datetime, timezone
4+
from datetime import datetime, timezone, timedelta
85
from pathlib import Path
96
import backoff
107
import bs4
118
import requests
129
from feedgen.feed import FeedGenerator
1310
from functional import seq
14-
from pyquery import PyQuery
15-
import jsonpickle
1611
from stopwatch import Stopwatch
1712

18-
HN_LOGO_B64 ="iVBORw0KGgoAAAANSUhEUgAAAMgAAADICAYAAACtWK6eAAAJmElEQVR4nOzdbYxcZdnA8WtmZ2e73TykT5uGmBIXbUqVaGsTobSAsUatoYlS36Jf1qaKBuoHQWMiCTbFRExKAppSWxIqrhHQAi0EWtc24tJtlxaktoYt9CW28tosu5LqzszOOXPGnFM+9G2v3XPNfWZ29vx/CQkp3LPXh/5n7pm595zcmVs+fKWIhP8AON/JnIisEpG1jZ4EmITWZRs9ATCZEQigIBBAQSCAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQAAFgQAKAgEUBAIoCARQEAigIBBAQSCAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQAAFgQAKAgEUuUYPUA+Zy2ZI2/VfqukxSjt/62yeRsq0T5e2T3/NtDYYfkfK+3uczzSZpSKQ6n/PSHbutdKy4Avmx8jMvlKK3eucztUIuSUrpXXlT+MvHBmWkV98OYmRJrV0bLGCQAob1kh18Jj5IXI3dEnLnLlOx2qE/JKVpnXe3/8swenXnc8z2aUjkPeVn76vpvUt85c4m6URsrNmS7ZzkWlt5bV+5/M0g3QFsr+npleRlquudTpPvbXd9F3zWn+gz+kszSJVgYQKv7ol2k9b5D62rGm3WfllX5fcjatNa0tb1kj1zHvOZ2oGqQsk3EeH+2mT/HRpv+0B1yMlLjvzcmn75j2mtf6eLeK98CfnMzWL1AUiNe6nM7PnSX7xcqfzJK3lqmtsC72ijO540PU4TSWVgXgHd0tw6qB5fds3fiaZ//t/pzMlqXXpV0zrvAPbJRgadD5PM0llIDJakuKmW+3rO2ZJ6zUrXE6UmNy8BdLykU+Z1vp7tzqfp9mkM5DwvcjQYLS/tmqWT7TaVtxmWlcdPCb+qQHn8zSb1AYSKj1xX7TPtsjNv875PK6Frx7Zqz9nWht9Z+R5zmdqNqkOpFooRPtsk45Z0t611vVIzmRnXi7TVt1rWuv3dafuzNVYUh2I1LjPzt3QJa0Lljqdx5X2rnWSmR3/O5tgYNeUOHPmCoEcP1TTt+v5z3Q5nceF8NXDurUafXaj83maWeoDCRU3rhEZGTKtDf8itt30becz1SJnPJBYefV58Y8ddj5PMyOQ8C/Gmyek9JsfmNfnb74zetaeLPJLbcfSy7vtn+pNVQTyPu/wvmj/bWX+ttqx3LwFpvceocqxvzmfp9kRyDlq2X9PlqPw1u89/D1bpFosOJ+n2RHIOcL9tz/wnGlt6yc+3/BtVvheyPLmvHKkV0qP1/a7MlMVgVzA+4vxd887ZkYfrTZKGGf4Xig2ryjFzd/n1WMMBHKBWvbh4bN3+B6gEazvgSrHD0RfmOLSCOQC4TNpaUOX+QjKtFX3Sqa9w/lc47Ge2C3vecz5LFMJgVyCd3iveM+sN63NzJ4ruYXLnM+ksZ7YjQ4kvmT85bGUIJAxjPZuNb+K1PsTLesnV7VexCINCGQM4b688qrtQgW5On4nYj2xWz15gAOJE0AginLvo6Z14TarHsdPrCd2q4MnpLDJfnIgTQhE4R/uNR9kzN98Z+KfaFlP7JYe/pEEw6cTmWmqIZBxlB7+sXmt9b3BRFhP7AZvDXAgMQYCGUf4l6m8/eemtUme9LWe2C0/z8e6cRDIBIzueMh8kDGpk77WE7t+/zbns0xlBDJBxd/dZV7r+qRvfvFy03sPv+d+jpTERCATFAwNmq+l5fp7kfwXb4+9ploYltLOh5zOkQYEEkO5z/b76y5P+rZ3rY2u7hhLxZPS5ls5c2VAIDF4+56Mnoljc3TSt2XO3OhCEXFVBv4q/pGXav75aUQgcXie+D0bTEtdnPS1btX8V9J56wIXCCSm0u5HxH/5adPaWk/6ti4xfHI1MiTei8+af2baEU
hcnifFTbebroJSy0nf/OLlku1cGHtdceN3pPqff5t+JgjEzH/tBdM66zbJ8slVMLCLb81rRCBGlaMHTOtarpgfe83Z7z1ifnLFReCcIBAjr39bdCo2rmznImm9buK3o26ZMze6H0lco4/+hFcPBwjEqFociU7FWkxb/UB0x9mJiG751jEr1uMHpw5K+bk/mmbD+QikBuEztPWM1kTuOBu+eli2Vj4XgHOGQGpU2HyH6QhK7sbV4261cku/Gn8grygeF2JwhkBqVC0WzLdzC7da0to65n/PG65U4j2zXoK3/2maBxcjEAeig4zHbR/75jqvvuSfR/cdifneo1oYPnuxCThDII54R/aa1mU/ev0l/9xy3xG/ZwMHEh0jEEf8/u2mywS1LvjsRX9mucZucOqQlHYaL5uKMRGII8G7b8lo9w9jr8t2LjzvNm7Wa+x6/U/GXoPxEYhD5f09pqugnLudsv72of9Kr2kddATimOVqheF2KnvFh87++wc/Hnt9MLBLgtOvx16H8RGIY+GrSOXovtjr2r+1XjKXzZDWT66Ita46eEKK3ZP3dtTNjkAS4PX9IfaabOci6bh7l2RmfCDWusKvv8dF4BJEIAmoHH3RtC4zfWas/9/fs0WCN/hSMEkEkoDwGX20+45Ef0b1vbe5bVodEEhCyn1PRc/wSfH6H+caV3VAIAka3fFgYo9d+YftZqOIh0ASFAwNirftbuePWy0Mi39qwPnj4mIEkrDS7kckOH3c3QOODEUXgRPPc/eYGBOBJM3zxDvwlLOHG33sLi4CV0cEUgeVlx3d6mxkiNum1RmB1EHlzRPi93XX/Djl3t87mQcTRyB1UuxeF904sxYeN7+pOwKpo9LWe8xrowOJHCmpOwKpI//YYdO1tISLwDUMgdRZ8O4bsdd4PfdzEbgGIZBmUDjT6AlSi0AABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQAAFgQAKAgEUBAIoCARQEAigIBBAQSCAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEECRa/QAaRP865BkquV4a4bfSWwe6AikzkpP/LLRIyAGtliAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQAAFgQAKAgEUBAIoCARQEAigIBBAQSCAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQABFTkROikhvowcBJqGT/wsAAP//8+CVXhPhYlQAAAAASUVORK5CYII="
19-
13+
from models import Story, StoryType
2014

2115
class HackernewsRateLimitException(Exception):
2216
pass
2317

2418

25-
class RcFile:
26-
def __init__(self, _queries):
27-
self.queries = _queries
28-
29-
30-
class HackerNewsStory:
19+
class HackerNewsStoryDto:
3120
def __init__(self, _title, _url, _hackerNewsUrl, _lastSeen, _postedDate):
3221
self.title = _title
3322
self.url = _url
@@ -46,13 +35,7 @@ def getLogger(self):
4635
rootLogger = logging.getLogger('root')
4736
return rootLogger
4837

49-
def getIconPath(self):
50-
iconBytes = base64.b64decode(HN_LOGO_B64)
51-
tempFolder = tempfile.gettempdir()
52-
iconPath = os.path.join(tempFolder, '__hnicon.png')
53-
with open(iconPath, 'wb') as iconFile:
54-
iconFile.write(iconBytes)
55-
return 'file://' + iconPath
38+
5639

5740
@backoff.on_exception(backoff.fibo, HackernewsRateLimitException)
5841
def processPage(self, pageNumber):
@@ -74,7 +57,7 @@ def processPage(self, pageNumber):
7457
if not (len(hackerNewsUrls) == len(titles) == len(urls) == len(dates)):
7558
raise Exception(f"Error in parsing page {pageNumber}: length of parsed elements is different. Hackernewsurls: {len(hackerNewsUrls)} Titles: {len(titles)} Urls: {len(urls)} Dates: {len(dates)}\n\n#Hackernewsurls\n{hackerNewsUrls}\n\n#Titles\n{titles}\n\n#Urls\n{urls}\n\n#Dates\n{dates}")
7659

77-
return seq(zip(titles, urls, hackerNewsUrls, dates)).map(lambda x: HackerNewsStory(x[0], x[1], x[2], datetime.now(timezone.utc), x[3])).to_list()
60+
return seq(zip(titles, urls, hackerNewsUrls, dates)).map(lambda x: HackerNewsStoryDto(x[0], x[1], x[2], datetime.now(timezone.utc), x[3])).to_list()
7861

7962

8063
def readRcFile(self):
@@ -104,40 +87,48 @@ def generateRss(self, stories, useHackernewsUrl=False):
10487
rssPath.write(rss)
10588

10689

90+
def cleanupOldStories(self):
    """Delete stories not seen on the front pages for over a week."""
    cutoff = datetime.now(timezone.utc) - timedelta(days=7)
    Story.delete().where(Story.last_seen < cutoff).execute()
92+
10793
def getOldStories(self):
    """Load every previously-scraped Hacker News story from the database.

    Returns a list of HackerNewsStoryDto. On any database error the
    failure is logged and an empty list is returned, so a scheduled
    scrape cycle can still proceed (best effort).
    """
    try:
        rows = Story.select().where(Story.type == StoryType.Hackernews.value)
        return [self.entityToDto(row) for row in rows]
    except Exception:
        # Best-effort: do not crash the scrape cycle, but never swallow
        # the error silently either (the original printed the rows for
        # debugging and dropped the exception).
        self.getLogger().error("Failed to load stories from the database", exc_info=True)
        return []
114100

115-
def writeOldStories(self, stories):
116-
storiesDbPath = Path.home() / ".hackernewsddb"
117-
with open(storiesDbPath, "w", encoding="utf-8") as dbFile:
118-
dbFile.write(jsonpickle.encode(stories))
101+
def entityToDto(self, entity: Story) -> HackerNewsStoryDto:
    """Convert a persisted Story row into its in-memory DTO form."""
    return HackerNewsStoryDto(
        entity.title,
        entity.url,
        entity.hnurl,
        entity.last_seen,
        entity.posted_date,
    )
103+
104+
def dtoToEntity(self, dto: HackerNewsStoryDto) -> Story:
    """Convert a scraped DTO into an unsaved Story row tagged as Hackernews."""
    return Story(
        title=dto.title,
        url=dto.url,
        hnurl=dto.hackerNewsUrl,
        last_seen=dto.lastSeen,
        posted_date=dto.postedDate,
        type=StoryType.Hackernews.value,
    )
106+
107+
108+
def insertNewStories(self, stories):
    """Persist freshly-scraped stories.

    All rows are written inside a single transaction so a batch of N
    stories costs one commit instead of N autocommits, and a mid-batch
    failure leaves the database unchanged.
    """
    # Story's database is the proxy initialized at startup (see app.initDb).
    with Story._meta.database.atomic():
        for dto in stories:
            self.dtoToEntity(dto).save()
119112

120113
def scrape(self):
    """Run one full scrape cycle: prune, fetch, diff, persist, publish RSS."""
    self.logger = self.getLogger()

    try:
        stopwatch = Stopwatch()
        # Prune stale rows first so the diff below only compares live stories.
        self.cleanupOldStories()
        oldStories = self.getOldStories()
        rcFile = self.readRcFile()
        queries = json.loads(rcFile)["queries"]
        # Walk the first 30 front pages and keep only stories whose title
        # matches at least one configured query (case-insensitive substring).
        allCurrentStories = seq.range(1, 30).flat_map(lambda p: self.processPage(p)).to_list()
        filteredStories = seq(allCurrentStories).filter(lambda x: seq(queries).map(lambda q: x.title.lower().find(q.lower()) != -1).any()).to_list()
        # NOTE(review): the diff relies on DTO equality (y == x); confirm
        # HackerNewsStoryDto defines __eq__, otherwise every story counts as new.
        diffStories = seq(filteredStories).filter(lambda x: not seq(oldStories).filter(lambda y: y == x).any()).to_list()
        self.insertNewStories(diffStories)

        # Re-read once after the insert and feed both RSS flavors from the
        # same snapshot (the original issued the identical query twice).
        currentStories = self.getOldStories()
        self.generateRss(currentStories)
        self.generateRss(currentStories, True)

        stopwatch.stop()
        self.logger.info(f"It took {str(stopwatch)} for a full cycle.")
    except Exception as e:
        self.logger.error(f"Unhandled exception occurred", exc_info=True)
        print(e)

src/models.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""ORM models shared by the scrapers (Hacker News today, Lobsters planned)."""
from enum import Enum
from peewee import Model, CharField, DateField, AutoField, IntegerField, DatabaseProxy, DateTimeField


class BaseModel(Model):
    """Base for all models; binds them to a database chosen at startup."""
    class Meta:
        # Placeholder database: app.initDb() calls .initialize() on this
        # proxy with the real SqliteDatabase before any query runs.
        database = DatabaseProxy()


class StoryType(Enum):
    """Which site a stored story was scraped from."""
    Hackernews = 1
    Lobsters = 2


class Story(BaseModel):
    """One scraped front-page story."""
    id = AutoField()  # autoincrement primary key
    title = CharField()
    url = CharField()    # the story's target link
    hnurl = CharField()  # the discussion page on the source site
    last_seen = DateTimeField()
    posted_date = DateTimeField()
    # Loop variable renamed (was `type`) so it no longer shadows the builtin.
    type = IntegerField(
        choices=[(st.value, st.name) for st in StoryType],
        default=StoryType.Hackernews.value,
    )

0 commit comments

Comments
 (0)