Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
7bb0c76
fix cleaning the wrong top node
idoshamun Jun 17, 2021
e4beb8f
add ul and ol to the list of nodes to check
idoshamun Jul 28, 2021
a626a41
deep copy top node in the output formatter
idoshamun Jul 28, 2021
a5b15ff
add support for section based articles
idoshamun Oct 19, 2021
a737681
fix medium heuristic for finding the content node
idoshamun Oct 25, 2021
a169a56
don't clean the default doc variable instead use clean_doc
idoshamun Oct 26, 2021
c01bdec
force medium heuristic on medium articles only
idoshamun Oct 26, 2021
a4dcbe4
fallback to clean doc if it wasn't possible to find top node
idoshamun Oct 26, 2021
7b95eb6
add fallback heuristics when can't find top node
idoshamun Oct 27, 2021
0655176
allow_redirects config option (#2)
vpol Jul 14, 2022
e149436
feat: allow to ignore certain basename regex (#3)
vpol Apr 13, 2023
7d34fc9
fix: more places where we have to check images
vpol Apr 13, 2023
b75385a
fix: type conversion list -> set
vpol Apr 25, 2023
d25229e
chore: set lxml version to supported (#6)
denisb0 Apr 3, 2024
46fed32
feat: add a fallback to image download fail case (#7)
denisb0 May 14, 2025
bc8fd32
feat: add final_url property to article
idoshamun Aug 25, 2025
e69b5f8
feat: added verify_ssl_cert option to pass to requests for proxy use
vpol Nov 30, 2025
c4af991
Merge pull request #9 from dailydotdev/feat_verify_ssl_cert_option
vpol Nov 30, 2025
cfa7b65
feat: use og:title if title is absent
vpol Mar 31, 2026
4e8b3f4
Merge pull request #10 from dailydotdev/feat_og_title
vpol Mar 31, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 33 additions & 9 deletions newspaper/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ def __init__(self, url, title='', source_url='', config=None, **kwargs):
# A property dict for users to store custom data.
self.additional_data = {}

# The final URL after redirects and meta refresh
self.final_url = None

def build(self):
"""Build a lone article from a URL independent of the source (newspaper).
Don't normally call this method b/c it's good to multithread articles
Expand All @@ -173,7 +176,9 @@ def _parse_scheme_file(self, path):

def _parse_scheme_http(self):
try:
return network.get_html_2XX_only(self.url, self.config)
html, final_url = network.get_html_2XX_only(self.url, self.config, return_final_url=True)
self.final_url = final_url
return html
except requests.exceptions.RequestException as e:
self.download_state = ArticleDownloadState.FAILED_RESPONSE
self.download_exception_msg = str(e)
Expand All @@ -190,18 +195,27 @@ def download(self, input_html=None, title=None, recursion_counter=0):
parsed_url = urlparse(self.url)
if parsed_url.scheme == "file":
html = self._parse_scheme_file(parsed_url.path)
# For file scheme, the final URL is the same as the initial URL
if self.final_url is None:
self.final_url = self.url
else:
html = self._parse_scheme_http()
# final_url is already set in _parse_scheme_http
if html is None:
log.debug('Download failed on URL %s because of %s' %
(self.url, self.download_exception_msg))
return
else:
html = input_html
# If HTML is provided directly and final_url not set, use the current URL
if self.final_url is None:
self.final_url = self.url

if self.config.follow_meta_refresh:
meta_refresh_url = extract_meta_refresh(html)
if meta_refresh_url and recursion_counter < 1:
# Update final_url to the meta refresh URL
self.final_url = meta_refresh_url
return self.download(
input_html=network.get_html(meta_refresh_url),
recursion_counter=recursion_counter + 1)
Expand All @@ -213,19 +227,22 @@ def parse(self):
self.throw_if_not_downloaded_verbose()

self.doc = self.config.get_parser().fromstring(self.html)
self.clean_doc = copy.deepcopy(self.doc)

if self.doc is None:
# `parse` call failed, return nothing
return

document_cleaner = DocumentCleaner(self.config)
output_formatter = OutputFormatter(self.config)

self.clean_doc = copy.deepcopy(self.doc)
# Before any computations on the body, clean DOM object
self.clean_doc = document_cleaner.clean(self.clean_doc)

# TODO: Fix this, sync in our fix_url() method
parse_candidate = self.get_parse_candidate()
self.link_hash = parse_candidate.link_hash # MD5

document_cleaner = DocumentCleaner(self.config)
output_formatter = OutputFormatter(self.config)

title = self.extractor.get_title(self.clean_doc)
self.set_title(title)

Expand Down Expand Up @@ -267,16 +284,23 @@ def parse(self):
self.url,
self.clean_doc)

# Before any computations on the body, clean DOM object
self.doc = document_cleaner.clean(self.doc)

self.top_node = self.extractor.calculate_best_node(self.doc)
if self.top_node is None:
self.top_node = self.extractor.calculate_best_node(self.clean_doc)
if self.top_node is None:
self.top_node = self.extractor.parser.getElementById(self.doc, 'content')
if self.top_node is None:
for tag in ['article', 'main']:
nodes = self.extractor.parser.getElementsByTag(self.doc, tag=tag)
if len(nodes) > 0:
self.top_node = nodes[0]
break
if self.top_node is not None:
video_extractor = VideoExtractor(self.config, self.top_node)
self.set_movies(video_extractor.get_videos())

self.top_node = self.extractor.post_cleanup(self.top_node)
self.clean_top_node = copy.deepcopy(self.top_node)
self.clean_top_node = self.extractor.post_cleanup(self.clean_top_node)

text, article_html = output_formatter.get_formatted(
self.top_node)
Expand Down
5 changes: 5 additions & 0 deletions newspaper/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ def __init__(self):
# Fail for error responses (e.g. 404 page)
self.http_success_only = True

# Allow redirects (enabled by default)
self.allow_redirects = True

self.ignored_images_suffix_list = []
# English is the fallback
self._language = 'en'

Expand All @@ -68,6 +72,7 @@ def __init__(self):
self.request_timeout = 7
self.proxies = {}
self.number_threads = 10
self.verify_ssl_cert = True

self.verbose = False # for debugging

Expand Down
49 changes: 34 additions & 15 deletions newspaper/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import copy
import logging
import os.path
import re
import re
from collections import defaultdict
Expand Down Expand Up @@ -253,12 +254,19 @@ def get_title(self, doc):
"""
title = ''
title_element = self.parser.getElementsByTag(doc, tag='title')
# no title found
if title_element is None or len(title_element) == 0:
return title
title_text_fb = (
self.get_meta_content(doc, 'meta[property="og:title"]') or
self.get_meta_content(doc, 'meta[name="og:title"]') or ''
)

# title elem found
title_text = self.parser.getText(title_element[0])
# no title found, fallback to og:title
if title_element is None or len(title_element) == 0:
title_text = title_text_fb
if not title_text:
return title
else:
# title elem found
title_text = self.parser.getText(title_element[0])
used_delimeter = False

# title from h1
Expand All @@ -280,11 +288,6 @@ def get_title(self, doc):
# clean double spaces
title_text_h1 = ' '.join([x for x in title_text_h1.split() if x])

# title from og:title
title_text_fb = (
self.get_meta_content(doc, 'meta[property="og:title"]') or
self.get_meta_content(doc, 'meta[name="og:title"]') or '')

# create filtered versions of title_text, title_text_h1, title_text_fb
# for finer comparison
filter_regex = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9\ ]')
Expand Down Expand Up @@ -449,26 +452,34 @@ def get_meta_img_url(self, article_url, doc):
"""
top_meta_image, try_one, try_two, try_three, try_four = [None] * 5
try_one = self.get_meta_content(doc, 'meta[property="og:image"]')
try_one = None if self.image_is_ignored(try_one) else try_one
if not try_one:
link_img_src_kwargs = \
{'tag': 'link', 'attr': 'rel', 'value': 'img_src|image_src'}
elems = self.parser.getElementsByTag(doc, use_regex=True, **link_img_src_kwargs)
try_two = elems[0].get('href') if elems else None

try_two = None if self.image_is_ignored(try_two) else try_two
if not try_two:
try_three = self.get_meta_content(doc, 'meta[name="og:image"]')

try_three = None if self.image_is_ignored(try_three) else try_three
if not try_three:
link_icon_kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
elems = self.parser.getElementsByTag(doc, **link_icon_kwargs)
try_four = elems[0].get('href') if elems else None
try_four = None if self.image_is_ignored(try_four) else try_four

top_meta_image = try_one or try_two or try_three or try_four

if top_meta_image:
return urljoin(article_url, top_meta_image)
return ''

def image_is_ignored(self, image):
    """Return True if the image URL's basename matches any configured
    ignore pattern (``config.ignored_images_suffix_list``).

    :param image: image URL or path; ``None``/empty is never ignored
    :return: bool
    """
    # Empty/None URLs can never match a pattern — bail out once instead
    # of re-testing `image` for every pattern.
    if not image:
        return False
    # Basename is loop-invariant: compute it once, not per pattern.
    basename = os.path.basename(image)
    # Generator short-circuits on the first matching pattern.
    return any(self.match_image(pattern, basename)
               for pattern in self.config.ignored_images_suffix_list)

def match_image(self, pattern, image):
    """Return True when the regex *pattern* occurs anywhere in *image*."""
    return bool(re.search(pattern, image))

def get_meta_type(self, doc):
"""Returns meta type of article, open graph protocol
"""
Expand Down Expand Up @@ -575,6 +586,7 @@ def get_img_urls(self, article_url, doc):
for img_tag in img_tags if img_tag.get('src')]
img_links = set([urljoin(article_url, url)
for url in urls])
img_links = set([x for x in img_links if not self.image_is_ignored(x)])
return img_links

def get_first_img_url(self, article_url, top_node):
Expand Down Expand Up @@ -1014,9 +1026,16 @@ def nodes_to_check(self, doc):
on like paragraphs and tables
"""
nodes_to_check = []
for tag in ['p', 'pre', 'td']:
items = self.parser.getElementsByTag(doc, tag=tag)
nodes_to_check += items
articles = self.parser.getElementsByTag(doc, tag='article')
if len(articles) > 0 and self.get_meta_site_name(doc) == 'Medium':
# Specific heuristic for Medium articles
sections = self.parser.getElementsByTag(articles[0], tag='section')
if len(sections) > 1:
nodes_to_check = sections
if len(nodes_to_check) == 0:
for tag in ['p', 'pre', 'td', 'ol', 'ul']:
items = self.parser.getElementsByTag(doc, tag=tag)
nodes_to_check += items
return nodes_to_check

def is_table_and_no_para_exist(self, e):
Expand Down
35 changes: 34 additions & 1 deletion newspaper/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,37 @@ def clean_url(url):
return url


def get_full_image_dimensions(image_url):
    """Fallback in case PIL can't open the streamed image.

    Downloads the complete image body in a single (non-streamed) request
    and reads its dimensions with PIL.

    :param image_url: absolute URL of the image to measure
    :return: ``(width, height)`` tuple, or ``None`` on any failure
    """
    try:
        # No stream=True: we deliberately want the whole body at once.
        # A timeout prevents this fallback from hanging indefinitely on
        # a stalled connection.
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # bad status codes are failures too

        # Treat the downloaded bytes as an in-memory file for PIL.
        image_bytes = io.BytesIO(response.content)

        # Context manager guarantees the image is closed even if reading
        # its size raises mid-way.
        with Image.open(image_bytes) as img:
            return img.size

    except requests.exceptions.RequestException as e:
        log.warning(f"Method 2 (Direct): Error fetching the image via requests: {e}")
        return None
    except Exception as e:
        # Covers PIL decode errors and anything unexpected (the old
        # FileNotFoundError branch was unreachable: io.BytesIO never
        # raises it).
        log.warning(f"Method 2 (Direct): An unexpected error occurred while opening image: {e}")
        return None


def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
cur_try = 0
nothing = None if dimension else (None, None)
Expand Down Expand Up @@ -143,7 +174,9 @@ def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
if dimension and p.image:
return p.image.size
elif dimension:
return nothing
# we did read the image, but it failed to parse for some reason
# try to download it in one go
return get_full_image_dimensions(url)
elif dimension:
# expected an image, but didn't get one
return nothing
Expand Down
23 changes: 16 additions & 7 deletions newspaper/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,17 @@
FAIL_ENCODING = 'ISO-8859-1'


def get_request_kwargs(timeout, useragent, proxies, headers):
def get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects=True, verify_ssl_cert=True):
    """This Wrapper method exists b/c some values in req_kwargs dict
    are methods which need to be called every time we make a request.

    ``allow_redirects`` and ``verify_ssl_cert`` default to True so older
    call sites that omit them (e.g. MRequest.send passes only five
    arguments) keep working instead of raising TypeError.

    :param timeout: per-request timeout in seconds
    :param useragent: User-Agent fallback when no headers are given
    :param proxies: requests-style proxies mapping
    :param headers: explicit header dict, or falsy to use the UA default
    :param allow_redirects: whether requests may follow redirects
    :param verify_ssl_cert: passed to requests' ``verify`` (TLS check)
    :return: kwargs dict ready to splat into ``requests.get``
    """
    return {
        'headers': headers if headers else {'User-Agent': useragent},
        'cookies': cj(),  # fresh cookie jar per request
        'timeout': timeout,
        'allow_redirects': allow_redirects,
        'proxies': proxies,
        'verify': verify_ssl_cert,
    }


Expand All @@ -44,7 +45,7 @@ def get_html(url, config=None, response=None):
return ''


def get_html_2XX_only(url, config=None, response=None):
def get_html_2XX_only(url, config=None, response=None, return_final_url=False):
"""Consolidated logic for http requests from newspaper. We handle error cases:
- Attempt to find encoding of the html by using HTTP header. Fallback to
'ISO-8859-1' if not provided.
Expand All @@ -55,19 +56,27 @@ def get_html_2XX_only(url, config=None, response=None):
timeout = config.request_timeout
proxies = config.proxies
headers = config.headers
verify_ssl_cert = config.verify_ssl_cert
allow_redirects = config.allow_redirects

if response is not None:
return _get_html_from_response(response, config)
html = _get_html_from_response(response, config)
if return_final_url:
return html, getattr(response, 'url', url)
return html

response = requests.get(
url=url, **get_request_kwargs(timeout, useragent, proxies, headers))
url=url, **get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects, verify_ssl_cert))

html = _get_html_from_response(response, config)
final_url = response.url

if config.http_success_only:
# fail if HTTP sends a non 2XX response
response.raise_for_status()

if return_final_url:
return html, final_url
return html


Expand Down Expand Up @@ -107,7 +116,7 @@ def __init__(self, url, config=None):
def send(self):
try:
self.resp = requests.get(self.url, **get_request_kwargs(
self.timeout, self.useragent, self.proxies, self.headers))
self.timeout, self.useragent, self.proxies, self.headers, self.config.allow_redirects))
if self.config.http_success_only:
self.resp.raise_for_status()
except requests.exceptions.RequestException as e:
Expand Down
3 changes: 2 additions & 1 deletion newspaper/outputformatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from html import unescape
import logging
import copy

from .text import innerTrim

Expand Down Expand Up @@ -42,7 +43,7 @@ def get_formatted(self, top_node):
"""Returns the body text of an article, and also the body article
html if specified. Returns in (text, html) form
"""
self.top_node = top_node
self.top_node = copy.deepcopy(top_node)
html, text = '', ''

self.remove_negativescores_nodes()
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ cssselect>=0.9.2
feedfinder2>=0.0.4
feedparser>=5.2.1
jieba3k>=0.35.1
lxml>=3.6.0
lxml==5.1.0 # https://lxml.de/5.2/changes-5.2.0.html
nltk>=3.2.1
Pillow>=3.3.0
pythainlp>=1.7.2
Expand Down
Loading