diff --git a/.gitignore b/.gitignore
index 894a44c..32a6946 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,11 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+.idea/
+
+*.db
+
+*.fb2
+
+*.html
diff --git a/Json structure.md b/Json structure.md
new file mode 100644
index 0000000..57cb35d
--- /dev/null
+++ b/Json structure.md
@@ -0,0 +1,21 @@
+{
+    "Source": string,
+    "Feeds": [
+        {
+            "title": string,
+            "date": string,
+            "link": string,
+            "description": string,
+            "media": [
+                {"url": string, "type": string}
+            ],
+            "links": [
+                {"url": string, "type": string}
+            ]
+        }
+    ]
+}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ab35068
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+feedparser~=5.2
+requests~=2.22
diff --git a/rss_reader/__init__.py b/rss_reader/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rss_reader/__main__.py b/rss_reader/__main__.py
new file mode 100644
index 0000000..51a024b
--- /dev/null
+++ b/rss_reader/__main__.py
@@ -0,0 +1,4 @@
+from .rss_reader import main
+
+if __name__ == '__main__':
+    main()
diff --git a/rss_reader/cache.py b/rss_reader/cache.py
new file mode 100644
index 0000000..04ba10e
--- /dev/null
+++ b/rss_reader/cache.py
@@ -0,0 +1,72 @@
+import sqlite3
+import logging
+
+
+file_path = 'cache.db'
+
+
+class Cache:
+    """This class provides a singleton connection to the news cache and methods for working with it."""
+    cursor = None
+    conn = None
+
+    def __init__(self):
+        """This method initializes the cursor to the database."""
+        if self.cursor is None:
+            Cache._init_cursor()
+        else:
+            logger = logging.getLogger('rss_reader')
+            logger.error("This is a singleton class. Use get_cursor")
+
+    @staticmethod
+    def _init_cursor():
+        Cache.conn = sqlite3.connect(file_path)
+        Cache.cursor = Cache.conn.cursor()
+        Cache.cursor.execute('''CREATE TABLE IF NOT EXISTS news(id INTEGER PRIMARY KEY,
+            title text, pub_date_key numeric, pub_date text, link text, description text, UNIQUE(link))''')
+        Cache.cursor.execute('''CREATE TABLE IF NOT EXISTS links(id INTEGER PRIMARY KEY,
+            link text, news numeric)''')
+        Cache.cursor.execute('''CREATE TABLE IF NOT EXISTS media(id INTEGER PRIMARY KEY,
+            link text, news numeric)''')
+
+    @staticmethod
+    def get_cursor():
+        """Static access method."""
+        if Cache.cursor is None:
+            Cache()
+        return Cache.cursor
+
+    @staticmethod
+    def commit():
+        """This method commits pending changes to the database."""
+        return Cache.conn.commit()
+
+    @staticmethod
+    def close():
+        """This method closes the connection to the database."""
+        return Cache.conn.close()
+
+    @staticmethod
+    def print_news(date):
+        """This method prints the cached news for the selected date to stdout."""
+        Cache.get_cursor()
+        Cache.cursor.execute('''SELECT * FROM news WHERE pub_date_key = ?''', (date,))
+        news = Cache.cursor.fetchall()
+        if len(news) == 0:
+            return 1
+        for elem in news:
+            print('\nTitle: ', elem[1])
+            print('Date: ', elem[3])
+            print('Link: ', elem[4])
+            print(f'Description: {elem[5]}\n')
+            Cache.cursor.execute('''SELECT * FROM links WHERE news = ?''', (elem[0],))
+            links = Cache.cursor.fetchall()
+            i = 1
+            for link in links:
+                print(f'Link[{i}]: ', link[1])
+                i = i + 1
+            Cache.cursor.execute('''SELECT * FROM media WHERE news = ?''', (elem[0],))
+            media_links = Cache.cursor.fetchall()
+            for link in media_links:
+                print(f'Link[{i}]: ', link[1])
+                i = i + 1
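For orientation, Cache is a module-level singleton: every caller shares one SQLite connection and one cursor. A minimal usage sketch, assuming the schema created in _init_cursor (the inserted values are placeholders):

    from rss_reader.cache import Cache

    cursor = Cache.get_cursor()   # lazily opens cache.db and creates the tables
    cursor.execute('INSERT OR IGNORE INTO news (title, pub_date_key, pub_date, link, description) '
                   'VALUES (?,?,?,?,?)',
                   ('Example title', 20191120, 'Wed, 20 Nov 2019', 'https://example.com/news/1', 'Text'))
    Cache.commit()                # persist the new row
    Cache.print_news(20191120)    # print everything cached under that date key
    Cache.close()                 # release the shared connection
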
""" + if Cache.cursor is None: + Cache() + return Cache.cursor + + @staticmethod + def commit(): + """This method commit to database database""" + return Cache.conn.commit() + + @staticmethod + def close(): + """This method close connection to database""" + return Cache.conn.close() + + @staticmethod + def print_news(date): + """This method print news to std from selected date to database""" + Cache.get_cursor() + Cache.cursor.execute('''SELECT * FROM news WHERE pub_date_key = ?''', (date,)) + news = Cache.cursor.fetchall() + if len(news) == 0: + return 1 + for elem in news: + print('\nTitle: ', elem[1]) + print('Date: ', elem[3]) + print('Link: ', elem[4]) + print(f'Description: {elem[5]}\n') + Cache.cursor.execute('''SELECT * FROM links WHERE news= ?''', (elem[0],)) + links = Cache.cursor.fetchall() + i = 1 + for link in links: + print(f'Link[{i}]: ', link[1]) + i = i + 1 + Cache.cursor.execute('''SELECT * FROM media WHERE news= ?''', (elem[0],)) + links = Cache.cursor.fetchall() + for link in links: + print(f'Link[{i}]: ', link[1]) + i = i + 1 diff --git a/rss_reader/news.py b/rss_reader/news.py new file mode 100644 index 0000000..68e9c46 --- /dev/null +++ b/rss_reader/news.py @@ -0,0 +1,186 @@ +import html +import os +import re +import json +import logging +from .cache import Cache +import base64 +import requests + + +class News: + """This class contains news and methods of work whit news""" + + http_header = 'http' + err_media_type = 'No type' + + def __init__(self, feeds_dict, limit): + + logger = logging.getLogger('rss_reader') + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler = logging.FileHandler('rss_reader_logs.log') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + logger.setLevel(logging.INFO) + self.news = dict() + self.all_news = list() + + self.name_of_source = feeds_dict.feed['title'] + + real_limit = len(feeds_dict.entries) + if limit > 0: + if limit < len(feeds_dict.entries): + real_limit = limit + + cursor = Cache.get_cursor() + + for i in range(real_limit): + list_to_cache = list() + self.news['title'] = html.unescape(feeds_dict.entries[i].title) + self.news['date'] = html.unescape(feeds_dict.entries[i].published) + self.news['link'] = html.unescape(feeds_dict.entries[i].link) + self.news['description'] = self.clean_from_tags(html.unescape(feeds_dict.entries[i].description)) + + date_dict = feeds_dict.entries[i].published_parsed + date_str = str(date_dict.tm_year) + str(date_dict.tm_mon) + str(date_dict.tm_mday) + + list_to_cache.append(self.news['title']) + list_to_cache.append(date_str) + list_to_cache.append(self.news['date']) + list_to_cache.append(self.news['link']) + list_to_cache.append(self.news['description']) + + self.news['media'] = self._parse_media(feeds_dict.entries[i]) + self.news['links'] = self._parse_links(feeds_dict.entries[i]) + + self._cache_feed(list_to_cache, self.news['links'], self.news['media'], cursor) + + self.all_news.append(self.news.copy()) + Cache.close() + + @staticmethod + def _parse_links(news_dict): + """This function parse links of feed""" + list_of_links = list() + if news_dict.links: + for elem in news_dict.links: + list_of_links.append({'url': elem.setdefault('url', None), 'type': elem.setdefault('type', None)}) + return list_of_links + + def _parse_media(self, news_dict): + """This function parse media of feed""" + if news_dict.setdefault('media_content', None): + media = list() + if news_dict.media_content: + for elem in news_dict.media_content: + if 
+
+    def _parse_media(self, news_dict):
+        """This function parses the media attachments of a feed entry."""
+        if news_dict.setdefault('media_content', None):
+            media = list()
+            for elem in news_dict.media_content:
+                if elem['url'].rfind(self.http_header) > 0:
+                    # Some sources put two links into one media string, and only the second one is the image
+                    links = elem['url'].split(self.http_header)
+                    media.append({'url': self.http_header + links[2], 'type': "img"})
+                else:
+                    if elem.setdefault('url', None):
+                        media.append({'url': elem.setdefault('url', None),
+                                      'type': elem.setdefault('type', None)})
+            return media
+        else:
+            return ''
+
+    def _cache_feed(self, list_of_main_info, list_of_links, list_of_media, cursor):
+        """This function writes a feed entry to the cache."""
+        cursor.execute('''INSERT OR IGNORE INTO news (title, pub_date_key, pub_date, link, description)
+            VALUES(?,?,?,?,?)''', list_of_main_info)
+        if cursor.rowcount == 0:
+            # The entry was already cached (the link is UNIQUE); do not duplicate its links and media
+            return
+        ids = cursor.lastrowid
+
+        list_to_cache_of_links = list()
+        for elem in list_of_links:
+            list_to_cache_of_links.append(elem.setdefault('url', None))
+            list_to_cache_of_links.append(ids)
+            cursor.execute('''INSERT OR IGNORE INTO links (link, news) VALUES(?,?)''', list_to_cache_of_links)
+            list_to_cache_of_links.clear()
+
+        list_to_cache_of_media = list()
+        for elem in list_of_media:
+            list_to_cache_of_media.append(elem.setdefault('url', None))
+            list_to_cache_of_media.append(ids)
+            cursor.execute('''INSERT OR IGNORE INTO media (link, news) VALUES(?,?)''', list_to_cache_of_media)
+            list_to_cache_of_media.clear()
+
+        Cache.commit()
+
+    @staticmethod
+    def clean_from_tags(text_with_tags):
+        """This function deletes HTML tags from a string."""
+        return re.sub('<.*?>', '', text_with_tags)
+
+    def print(self):
+        """This function prints the news to stdout in a readable format."""
+        print(f'Source: {self.name_of_source}\n')
+        for elem in self.all_news:
+            print(f'Title: {elem["title"]}')
+            print(f'Date: {elem["date"]}')
+            print(f'Link: {elem["link"]}')
+            print(f'Description: {elem["description"]}\n')
+
+            j = 1
+            print('Links: ')
+            for link in elem['links']:
+                print(f'[{j}] {link["url"]} ({link["type"]})')
+                j = j + 1
+
+            if elem.setdefault('media', None):
+                print("Media: ")
+                for media in elem['media']:
+                    print(f'[{j}] {media["url"]} ({media["type"]})')
+                    j = j + 1
+
+    def to_json(self):
+        """This function returns the news as a UTF-8 encoded JSON string."""
+        return json.dumps({'Source': self.name_of_source, 'Feeds': self.all_news}, ensure_ascii=False).encode('utf8')
+
+    def create_fb2(self, filepath):
+        """This function writes the news to an FB2 file."""
+        filename = filepath
+        if filepath[-4:] != ".fb2":
+            filename = filepath + ".fb2"
+        with open(filename, 'w', encoding="utf-8") as fb2_file:
+            fb2_file.write('<?xml version="1.0" encoding="utf-8"?>\n')
+            fb2_file.write('<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" '
+                           'xmlns:l="http://www.w3.org/1999/xlink"><body><section>')
+            fb2_file.write(f'<title><p>{self.name_of_source.replace("&", "&amp;")}</p></title>')
+            image_id = 0
+            for elem in self.all_news:
+                fb2_file.write(f'<p>{elem["title"].replace("&", "&amp;")}</p>')
+                fb2_file.write(f'<p>Date of posting: {elem["date"].replace("&", "&amp;")}</p>')
+                fb2_file.write(f'<p>{elem["description"].replace("&", "&amp;")}</p>')
+                fb2_file.write(f'<p>Source: {elem["link"]}</p>'.replace("&", "&amp;"))
+                for media in elem['media']:
+                    if media['type'] != self.err_media_type:
+                        # The href must match the id of the <binary> element written below
+                        fb2_file.write(f'<image l:href="#img{image_id}"/>')
+                        image_id += 1
+            fb2_file.write('</section></body>')
+            image_id = 0
+            for elem in self.all_news:
+                for media in elem['media']:
+                    if media['type'] != self.err_media_type:
+                        # Assume JPEG; the parsed media type is not always a valid MIME type
+                        fb2_file.write(f'<binary id="img{image_id}" content-type="image/jpeg">')
+                        content = base64.b64encode(requests.get(media["url"]).content)
+                        fb2_file.write(content.decode('ascii'))
+                        fb2_file.write('</binary>')
+                        image_id += 1
+            fb2_file.write('</FictionBook>')
+
+        print(f'All news you can find at {os.path.realpath(filename)}')
+
+    def create_html(self, filepath):
+        """This function writes the news to an HTML file."""
+        filename = filepath
+        if filepath[-5:] != ".html":
+            filename = filepath + ".html"
+        with open(filename, 'w', encoding="utf-8") as html_file:
+            html_file.write(f'<html>\n<head><title>{self.name_of_source}</title></head>\n<body>\n')
+            for elem in self.all_news:
+                html_file.write(f'<h2>{elem["title"]}</h2>')
+                html_file.write(f'<p>Date of posting: {elem["date"]}</p>')
+                html_file.write(f'<p>{elem["description"]}</p>')
+                html_file.write(f'<p><a href="{elem["link"]}">Link to source</a></p>')
+
+                for media in elem['media']:
+                    if media['type'] != self.err_media_type:
+                        html_file.write(f'<img src="{media["url"]}">')
+                html_file.write('<br>')
+            html_file.write('</body>\n</html>')
+        print(f'All news you can find at {os.path.realpath(filename)}')
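For reference, News.to_json serialises into the shape documented in Json structure.md above. An illustrative instance with made-up values:

    {
        "Source": "Example Feed",
        "Feeds": [{
            "title": "Example news title",
            "date": "Wed, 20 Nov 2019 10:00:00 +0000",
            "link": "https://example.com/news/1",
            "description": "Plain-text description with tags stripped",
            "media": [{"url": "https://example.com/img.jpg", "type": "img"}],
            "links": [{"url": "https://example.com/news/1", "type": "text/html"}]
        }]
    }
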
diff --git a/rss_reader/rss_reader.py b/rss_reader/rss_reader.py
new file mode 100644
index 0000000..2028ab3
--- /dev/null
+++ b/rss_reader/rss_reader.py
@@ -0,0 +1,77 @@
+import argparse
+import feedparser
+from .news import News
+from .cache import Cache
+import logging
+import sys
+
+
+version = '1.5'
+
+
+def main():
+    """Main function of the program."""
+
+    logger = logging.getLogger('rss_reader')
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    file_handler = logging.FileHandler('rss_reader_logs.log')
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+    logger.setLevel(logging.INFO)
+
+    parser = argparse.ArgumentParser(description='Python RSS-reader')
+    parser.add_argument('URL', type=str, help='RSS URL')
+    parser.add_argument('--version', help='Print version info', action='version', version=version)
+    parser.add_argument('--json', help='Print result as JSON in stdout', action='store_true')
+    parser.add_argument('-V', '--verbose', help='Output verbose status messages', action='store_true')
+    parser.add_argument('-L', '--limit', help='Limit news topics if this parameter is provided', type=int, default=0)
+    parser.add_argument('--date', help='Find news in the cache if this parameter is provided', type=int, default=0)
+    parser.add_argument('--to-html', help='Create an HTML file with the news', type=str, default="")
+    parser.add_argument('--to-fb2', help='Create an FB2 file with the news', type=str, default="")
+    args = parser.parse_args()
+
+    if args.verbose:
+        stdout_handler = logging.StreamHandler(sys.stdout)
+        stdout_handler.setFormatter(formatter)
+        logger.addHandler(stdout_handler)
+
+    if args.date:
+        logger.info('Starting to read from cache')
+        state = Cache.print_news(args.date)
+        if state == 1:
+            print(f'No news with a publication date of {args.date} exist in the cache.'
+                  f'\nMake sure that your date is in %Y%m%d format', file=sys.stderr)
+        else:
+            logger.info('News from cache were read')
+    else:
+
+        feeds = feedparser.parse(args.URL)
+
+        if feeds.bozo:
+            print('This is not well-formed XML', file=sys.stderr)
+            exit()
+
+        else:
+            logger.info('The XML file with news was received and is correct')
+
+        news = News(feeds, args.limit)
+        logger.info('News is parsed')
+
+        if args.to_html:
+            news.create_html(args.to_html)
+
+        elif args.to_fb2:
+            news.create_fb2(args.to_fb2)
+
+        elif args.json:
+            print(news.to_json().decode())
+            logger.info('News is displayed in stdout in JSON format')
+        else:
+            news.print()
+            logger.info('News is displayed in stdout in a readable format')
+
+    logger.info('Program is over')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6b7fd40
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,20 @@
+import setuptools
+
+with open('requirements.txt') as f:
+    required = f.read().splitlines()
+
+setuptools.setup(
+    name='rss_reader',
+    version='1.5',
+    author='Boris Dashko',
+    author_email='borya.dashko@gmail.com',
+    url='https://github.com/BoryaD/PythonHomework/tree/FinalTask',
+    packages=setuptools.find_packages(),
+    python_requires='>=3.8',
+    install_requires=required,
+    entry_points={
+        'console_scripts': [
+            'rss-reader = rss_reader.rss_reader:main',
+        ],
+    },
+)
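With setup.py in place, the package can be installed and driven through the rss-reader console script. An illustrative session (the feed URL is a placeholder; any valid RSS URL should work):

    pip install .
    rss-reader --version
    rss-reader https://example.com/rss --limit 2 --json
    rss-reader https://example.com/rss --to-html news --verbose
    rss-reader https://example.com/rss --date 20191120    # reads from the local cache; URL is still required
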
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/cache_tests.py b/tests/cache_tests.py
new file mode 100644
index 0000000..6e01a14
--- /dev/null
+++ b/tests/cache_tests.py
@@ -0,0 +1,27 @@
+import unittest
+from rss_reader.cache import Cache
+
+
+class CacheTest(unittest.TestCase):
+
+    def setUp(self):
+        pass
+
+    def test_close(self):
+        pass
+
+    def test_commit(self):
+        pass
+
+    def test_print_news(self):
+        pass
+
+    def test_get_cursor(self):
+        pass
+
+    def test_init_cursor(self):
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/news_tests.py b/tests/news_tests.py
new file mode 100644
index 0000000..0edcd0a
--- /dev/null
+++ b/tests/news_tests.py
@@ -0,0 +1,31 @@
+import unittest
+from rss_reader.news import News
+
+
+class NewsTest(unittest.TestCase):
+
+    def setUp(self):
+        self.useful_data = "DataClear"
+        # Markup that clean_from_tags should strip down to useful_data
+        self.data_with_tags = "<b>Data</b><i>Clear</i>"
+
+    def test_clear_from_tags(self):
+        self.assertEqual(self.useful_data, News.clean_from_tags(self.data_with_tags))
+
+    def test_parse_links(self):
+        pass
+
+    def test_parse_media(self):
+        pass
+
+    def test_create_fb2(self):
+        pass
+
+    def test_create_html(self):
+        pass
+
+    def test_to_json(self):
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
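Most of the test methods above are still stubs. One possible way to fill in test_get_cursor, a sketch only, assuming the singleton behaviour of Cache and that creating cache.db in the working directory is acceptable:

    import unittest
    from rss_reader.cache import Cache


    class CacheSingletonTest(unittest.TestCase):

        def test_get_cursor_returns_shared_cursor(self):
            # The first call lazily opens cache.db and creates the tables;
            # every later call must hand back the very same cursor object.
            first = Cache.get_cursor()
            second = Cache.get_cursor()
            self.assertIs(first, second)


    if __name__ == '__main__':
        unittest.main()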