1- import base64
21import json
32import logging
43import re
5- import os .path
6- import tempfile
7- from datetime import datetime , timezone
4+ from datetime import datetime , timezone , timedelta
85from pathlib import Path
96import backoff
107import bs4
118import requests
129from feedgen .feed import FeedGenerator
1310from functional import seq
14- from pyquery import PyQuery
15- import jsonpickle
1611from stopwatch import Stopwatch
1712
18- HN_LOGO_B64 = "iVBORw0KGgoAAAANSUhEUgAAAMgAAADICAYAAACtWK6eAAAJmElEQVR4nOzdbYxcZdnA8WtmZ2e73TykT5uGmBIXbUqVaGsTobSAsUatoYlS36Jf1qaKBuoHQWMiCTbFRExKAppSWxIqrhHQAi0EWtc24tJtlxaktoYt9CW28tosu5LqzszOOXPGnFM+9G2v3XPNfWZ29vx/CQkp3LPXh/5n7pm595zcmVs+fKWIhP8AON/JnIisEpG1jZ4EmITWZRs9ATCZEQigIBBAQSCAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQAAFgQAKAgEUBAIoCARQEAigIBBAQSCAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQAAFgQAKAgEUuUYPUA+Zy2ZI2/VfqukxSjt/62yeRsq0T5e2T3/NtDYYfkfK+3uczzSZpSKQ6n/PSHbutdKy4Avmx8jMvlKK3eucztUIuSUrpXXlT+MvHBmWkV98OYmRJrV0bLGCQAob1kh18Jj5IXI3dEnLnLlOx2qE/JKVpnXe3/8swenXnc8z2aUjkPeVn76vpvUt85c4m6URsrNmS7ZzkWlt5bV+5/M0g3QFsr+npleRlquudTpPvbXd9F3zWn+gz+kszSJVgYQKv7ol2k9b5D62rGm3WfllX5fcjatNa0tb1kj1zHvOZ2oGqQsk3EeH+2mT/HRpv+0B1yMlLjvzcmn75j2mtf6eLeK98CfnMzWL1AUiNe6nM7PnSX7xcqfzJK3lqmtsC72ijO540PU4TSWVgXgHd0tw6qB5fds3fiaZ//t/pzMlqXXpV0zrvAPbJRgadD5PM0llIDJakuKmW+3rO2ZJ6zUrXE6UmNy8BdLykU+Z1vp7tzqfp9mkM5DwvcjQYLS/tmqWT7TaVtxmWlcdPCb+qQHn8zSb1AYSKj1xX7TPtsjNv875PK6Frx7Zqz9nWht9Z+R5zmdqNqkOpFooRPtsk45Z0t611vVIzmRnXi7TVt1rWuv3dafuzNVYUh2I1LjPzt3QJa0Lljqdx5X2rnWSmR3/O5tgYNeUOHPmCoEcP1TTt+v5z3Q5nceF8NXDurUafXaj83maWeoDCRU3rhEZGTKtDf8itt30becz1SJnPJBYefV58Y8ddj5PMyOQ8C/Gmyek9JsfmNfnb74zetaeLPJLbcfSy7vtn+pNVQTyPu/wvmj/bWX+ttqx3LwFpvceocqxvzmfp9kRyDlq2X9PlqPw1u89/D1bpFosOJ+n2RHIOcL9tz/wnGlt6yc+3/BtVvheyPLmvHKkV0qP1/a7MlMVgVzA+4vxd887ZkYfrTZKGGf4Xig2ryjFzd/n1WMMBHKBWvbh4bN3+B6gEazvgSrHD0RfmOLSCOQC4TNpaUOX+QjKtFX3Sqa9w/lc47Ge2C3vecz5LFMJgVyCd3iveM+sN63NzJ4ruYXLnM+ksZ7YjQ4kvmT85bGUIJAxjPZuNb+K1PsTLesnV7VexCINCGQM4b688qrtQgW5On4nYj2xWz15gAOJE0AginLvo6Z14TarHsdPrCd2q4MnpLDJfnIgTQhE4R/uNR9kzN98Z+KfaFlP7JYe/pEEw6cTmWmqIZBxlB7+sXmt9b3BRFhP7AZvDXAgMQYCGUf4l6m8/eemtUme9LWe2C0/z8e6cRDIBIzueMh8kDGpk77WE7t+/zbns0xlBDJBxd/dZV7r+qRvfvFy03sPv+d+jpTERCATFAwNmq+l5fp7kfwXb4+9ploYltLOh5zOkQYEEkO5z/b76y5P+rZ3rY2u7hhLxZPS5ls5c2VAIDF4+56Mnoljc3TSt2XO3OhCEXFVBv4q/pGXav75aUQgcXie+D0bTEtdnPS1btX8V9J56wIXCCSm0u5HxH/5adPaWk/6ti4xfHI1MiTei8+af
2baEUhcnifFTbebroJSy0nf/OLlku1cGHtdceN3pPqff5t+JgjEzH/tBdM66zbJ8slVMLCLb81rRCBGlaMHTOtarpgfe83Z7z1ifnLFReCcIBAjr39bdCo2rmznImm9buK3o26ZMze6H0lco4/+hFcPBwjEqFociU7FWkxb/UB0x9mJiG751jEr1uMHpw5K+bk/mmbD+QikBuEztPWM1kTuOBu+eli2Vj4XgHOGQGpU2HyH6QhK7sbV4261cku/Gn8grygeF2JwhkBqVC0WzLdzC7da0to65n/PG65U4j2zXoK3/2maBxcjEAeig4zHbR/75jqvvuSfR/cdifneo1oYPnuxCThDII54R/aa1mU/ev0l/9xy3xG/ZwMHEh0jEEf8/u2mywS1LvjsRX9mucZucOqQlHYaL5uKMRGII8G7b8lo9w9jr8t2LjzvNm7Wa+x6/U/GXoPxEYhD5f09pqugnLudsv72of9Kr2kddATimOVqheF2KnvFh87++wc/Hnt9MLBLgtOvx16H8RGIY+GrSOXovtjr2r+1XjKXzZDWT66Ita46eEKK3ZP3dtTNjkAS4PX9IfaabOci6bh7l2RmfCDWusKvv8dF4BJEIAmoHH3RtC4zfWas/9/fs0WCN/hSMEkEkoDwGX20+45Ef0b1vbe5bVodEEhCyn1PRc/wSfH6H+caV3VAIAka3fFgYo9d+YftZqOIh0ASFAwNirftbuePWy0Mi39qwPnj4mIEkrDS7kckOH3c3QOODEUXgRPPc/eYGBOBJM3zxDvwlLOHG33sLi4CV0cEUgeVlx3d6mxkiNum1RmB1EHlzRPi93XX/Djl3t87mQcTRyB1UuxeF904sxYeN7+pOwKpo9LWe8xrowOJHCmpOwKpI//YYdO1tISLwDUMgdRZ8O4bsdd4PfdzEbgGIZBmUDjT6AlSi0AABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQAAFgQAKAgEUBAIoCARQEAigIBBAQSCAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEECRa/QAaRP865BkquV4a4bfSWwe6AikzkpP/LLRIyAGtliAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQAAFgQAKAgEUBAIoCARQEAigIBBAQSCAgkAABYEACgIBFAQCKAgEUBAIoCAQQEEggIJAAAWBAAoCARQEAigIBFAQCKAgEEBBIICCQABFTkROikhvowcBJqGT/wsAAP//8+CVXhPhYlQAAAAASUVORK5CYII="
19-
13+ from models import Story , StoryType
2014
class HackernewsRateLimitException(Exception):
    """Signals that Hacker News rejected a page request due to rate limiting.

    Raising this type triggers the fibonacci backoff retry configured on
    the page-processing method.
    """
2317
2418
class RcFile:
    """Value object holding the search queries parsed from the user's rc file."""

    def __init__(self, _queries):
        # Stored as-is; callers access the list via the `queries` attribute.
        self.queries = _queries
30- class HackerNewsStory :
19+ class HackerNewsStoryDto :
3120 def __init__ (self , _title , _url , _hackerNewsUrl , _lastSeen , _postedDate ):
3221 self .title = _title
3322 self .url = _url
@@ -46,13 +35,7 @@ def getLogger(self):
4635 rootLogger = logging .getLogger ('root' )
4736 return rootLogger
4837
49- def getIconPath (self ):
50- iconBytes = base64 .b64decode (HN_LOGO_B64 )
51- tempFolder = tempfile .gettempdir ()
52- iconPath = os .path .join (tempFolder , '__hnicon.png' )
53- with open (iconPath , 'wb' ) as iconFile :
54- iconFile .write (iconBytes )
55- return 'file://' + iconPath
38+
5639
5740 @backoff .on_exception (backoff .fibo , HackernewsRateLimitException )
5841 def processPage (self , pageNumber ):
@@ -74,7 +57,7 @@ def processPage(self, pageNumber):
7457 if not (len (hackerNewsUrls ) == len (titles ) == len (urls ) == len (dates )):
7558 raise Exception (f"Error in parsing page { pageNumber } : length of parsed elements is different. Hackernewsurls: { len (hackerNewsUrls )} Titles: { len (titles )} Urls: { len (urls )} Dates: { len (dates )} \n \n #Hackernewsurls\n { hackerNewsUrls } \n \n #Titles\n { titles } \n \n #Urls\n { urls } \n \n #Dates\n { dates } " )
7659
77- return seq (zip (titles , urls , hackerNewsUrls , dates )).map (lambda x : HackerNewsStory (x [0 ], x [1 ], x [2 ], datetime .now (timezone .utc ), x [3 ])).to_list ()
60+ return seq (zip (titles , urls , hackerNewsUrls , dates )).map (lambda x : HackerNewsStoryDto (x [0 ], x [1 ], x [2 ], datetime .now (timezone .utc ), x [3 ])).to_list ()
7861
7962
8063 def readRcFile (self ):
@@ -104,40 +87,48 @@ def generateRss(self, stories, useHackernewsUrl=False):
10487 rssPath .write (rss )
10588
10689
90+ def cleanupOldStories (self ):
91+ Story .delete ().where (Story .last_seen < datetime .now (timezone .utc ) - timedelta (days = 7 )).execute ()
92+
10793 def getOldStories (self ):
108- storiesDbPath = Path . home () / ".hackernewsddb"
109- if os . path . exists ( storiesDbPath ):
110- with open ( storiesDbPath , "r" , encoding = "utf-8" ) as dbFile :
111- return jsonpickle . decode ( dbFile . read () )
112- else :
94+ try :
95+ all_stories = list ( Story . select (). where ( Story . type == StoryType . Hackernews . value ))
96+ print ( all_stories )
97+ return seq ( all_stories ). map ( lambda s : self . entityToDto ( s )). to_list ( )
98+ except Exception as e :
11399 return []
114100
115- def writeOldStories (self , stories ):
116- storiesDbPath = Path .home () / ".hackernewsddb"
117- with open (storiesDbPath , "w" , encoding = "utf-8" ) as dbFile :
118- dbFile .write (jsonpickle .encode (stories ))
101+ def entityToDto (self , entity : Story ) -> HackerNewsStoryDto :
102+ return HackerNewsStoryDto (entity .title , entity .url , entity .hnurl , entity .last_seen , entity .posted_date )
103+
104+ def dtoToEntity (self , dto :HackerNewsStoryDto ) -> Story :
105+ return Story (title = dto .title , url = dto .url ,hnurl = dto .hackerNewsUrl , last_seen = dto .lastSeen , posted_date = dto .postedDate , type = StoryType .Hackernews .value )
106+
107+
108+ def insertNewStories (self , stories ):
109+ for s in stories :
110+ entity = self .dtoToEntity (s )
111+ entity .save ()
119112
120113 def scrape (self ):
121- self .ICON_PATH = self .getIconPath ()
122114 self .logger = self .getLogger ()
123115
124116 try :
125117 stopwatch = Stopwatch ()
118+ self .cleanupOldStories ()
126119 oldStories = self .getOldStories ()
127120 rcFile = self .readRcFile ()
128121 queries = json .loads (rcFile )["queries" ]
129- allStories = seq .range (1 , 30 ).flat_map (lambda p : self .processPage (p )).to_list ()
130- filteredStories = seq (allStories ).filter (lambda x : seq (queries ).map (lambda q : x .title .lower ().find (q .lower ()) != - 1 ).any ()).to_list ()
122+ allCurrentStories = seq .range (1 , 30 ).flat_map (lambda p : self .processPage (p )).to_list ()
123+ filteredStories = seq (allCurrentStories ).filter (lambda x : seq (queries ).map (lambda q : x .title .lower ().find (q .lower ()) != - 1 ).any ()).to_list ()
131124 diffStories = seq (filteredStories ).filter (lambda x : not seq (oldStories ).filter (lambda y : y == x ).any ()).to_list ()
125+ self .insertNewStories (diffStories )
132126
133-
134- updatedOldStories = seq (oldStories + diffStories ).filter (lambda s : (datetime .now (timezone .utc ) - s .lastSeen ).days < 7 ).to_list ()
135- self .writeOldStories (updatedOldStories )
136- self .generateRss (updatedOldStories )
137- self .generateRss (updatedOldStories , True )
127+ self .generateRss (self .getOldStories ())
128+ self .generateRss (self .getOldStories (), True )
138129
139130 stopwatch .stop ()
140131 self .logger .info (f"It took { str (stopwatch )} for a full cycle." )
141132 except Exception as e :
142133 self .logger .error (f"Unhandled exception occurred" , exc_info = True )
143- print (e )
134+ print (e )
0 commit comments